git.proxmox.com Git - mirror_zfs.git/blob - module/icp/asm-x86_64/blake3/blake3_sse41.S
Introduce BLAKE3 checksums as an OpenZFS feature
[mirror_zfs.git] / module / icp / asm-x86_64 / blake3 / blake3_sse41.S
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
24 * Copyright (c) 2019-2020 Samuel Neves
25 * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
26 */
27
28 #if defined(HAVE_SSE4_1)
29
30 #define _ASM
31 #include <sys/asm_linkage.h>
32
/*
 * Control-flow Enforcement Technology (CET) glue.
 * <cet.h> is included only on ELF targets built with __CET__ defined, and
 * only when the preprocessor supports __has_include and can locate the
 * header.
 */
33 #if defined(__ELF__) && defined(__CET__) && defined(__has_include)
34 #if __has_include(<cet.h>)
35 #include <cet.h>
36 #endif
37 #endif
38
/*
 * Fallback: when nothing above has defined _CET_ENDBR (presumably <cet.h>
 * supplies it -- confirm against the toolchain header), define it as empty
 * so the function entry points that use the macro still assemble.
 */
39 #if !defined(_CET_ENDBR)
40 #define _CET_ENDBR
41 #endif
42
/*
 * From here on the assembler reads Intel operand order (destination first)
 * with bare register names (no '%' prefix).
 */
43 .intel_syntax noprefix
/* Symbols made visible to the linker from this object file. */
44 .global zfs_blake3_compress_in_place_sse41
45 .global zfs_blake3_compress_xof_sse41
46 .global zfs_blake3_hash_many_sse41
47
/*
 * Switch to the code section and tag each exported symbol as a function
 * in the ELF symbol table.
 */
48 .text
49 .type zfs_blake3_hash_many_sse41,@function
50 .type zfs_blake3_compress_in_place_sse41,@function
51 .type zfs_blake3_compress_xof_sse41,@function
52
53 .p2align 6
54 zfs_blake3_hash_many_sse41:
55 _CET_ENDBR
56 push r15
57 push r14
58 push r13
59 push r12
60 push rbx
61 push rbp
62 mov rbp, rsp
63 sub rsp, 360
64 and rsp, 0xFFFFFFFFFFFFFFC0
65 neg r9d
66 movd xmm0, r9d
67 pshufd xmm0, xmm0, 0x00
68 movdqa xmmword ptr [rsp+0x130], xmm0
69 movdqa xmm1, xmm0
70 pand xmm1, xmmword ptr [ADD0+rip]
71 pand xmm0, xmmword ptr [ADD1+rip]
72 movdqa xmmword ptr [rsp+0x150], xmm0
73 movd xmm0, r8d
74 pshufd xmm0, xmm0, 0x00
75 paddd xmm0, xmm1
76 movdqa xmmword ptr [rsp+0x110], xmm0
77 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
78 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
79 pcmpgtd xmm1, xmm0
80 shr r8, 32
81 movd xmm2, r8d
82 pshufd xmm2, xmm2, 0x00
83 psubd xmm2, xmm1
84 movdqa xmmword ptr [rsp+0x120], xmm2
85 mov rbx, qword ptr [rbp+0x50]
86 mov r15, rdx
87 shl r15, 6
88 movzx r13d, byte ptr [rbp+0x38]
89 movzx r12d, byte ptr [rbp+0x48]
90 cmp rsi, 4
91 jc 3f
92 2:
93 movdqu xmm3, xmmword ptr [rcx]
94 pshufd xmm0, xmm3, 0x00
95 pshufd xmm1, xmm3, 0x55
96 pshufd xmm2, xmm3, 0xAA
97 pshufd xmm3, xmm3, 0xFF
98 movdqu xmm7, xmmword ptr [rcx+0x10]
99 pshufd xmm4, xmm7, 0x00
100 pshufd xmm5, xmm7, 0x55
101 pshufd xmm6, xmm7, 0xAA
102 pshufd xmm7, xmm7, 0xFF
103 mov r8, qword ptr [rdi]
104 mov r9, qword ptr [rdi+0x8]
105 mov r10, qword ptr [rdi+0x10]
106 mov r11, qword ptr [rdi+0x18]
107 movzx eax, byte ptr [rbp+0x40]
108 or eax, r13d
109 xor edx, edx
110 9:
111 mov r14d, eax
112 or eax, r12d
113 add rdx, 64
114 cmp rdx, r15
115 cmovne eax, r14d
116 movdqu xmm8, xmmword ptr [r8+rdx-0x40]
117 movdqu xmm9, xmmword ptr [r9+rdx-0x40]
118 movdqu xmm10, xmmword ptr [r10+rdx-0x40]
119 movdqu xmm11, xmmword ptr [r11+rdx-0x40]
120 movdqa xmm12, xmm8
121 punpckldq xmm8, xmm9
122 punpckhdq xmm12, xmm9
123 movdqa xmm14, xmm10
124 punpckldq xmm10, xmm11
125 punpckhdq xmm14, xmm11
126 movdqa xmm9, xmm8
127 punpcklqdq xmm8, xmm10
128 punpckhqdq xmm9, xmm10
129 movdqa xmm13, xmm12
130 punpcklqdq xmm12, xmm14
131 punpckhqdq xmm13, xmm14
132 movdqa xmmword ptr [rsp], xmm8
133 movdqa xmmword ptr [rsp+0x10], xmm9
134 movdqa xmmword ptr [rsp+0x20], xmm12
135 movdqa xmmword ptr [rsp+0x30], xmm13
136 movdqu xmm8, xmmword ptr [r8+rdx-0x30]
137 movdqu xmm9, xmmword ptr [r9+rdx-0x30]
138 movdqu xmm10, xmmword ptr [r10+rdx-0x30]
139 movdqu xmm11, xmmword ptr [r11+rdx-0x30]
140 movdqa xmm12, xmm8
141 punpckldq xmm8, xmm9
142 punpckhdq xmm12, xmm9
143 movdqa xmm14, xmm10
144 punpckldq xmm10, xmm11
145 punpckhdq xmm14, xmm11
146 movdqa xmm9, xmm8
147 punpcklqdq xmm8, xmm10
148 punpckhqdq xmm9, xmm10
149 movdqa xmm13, xmm12
150 punpcklqdq xmm12, xmm14
151 punpckhqdq xmm13, xmm14
152 movdqa xmmword ptr [rsp+0x40], xmm8
153 movdqa xmmword ptr [rsp+0x50], xmm9
154 movdqa xmmword ptr [rsp+0x60], xmm12
155 movdqa xmmword ptr [rsp+0x70], xmm13
156 movdqu xmm8, xmmword ptr [r8+rdx-0x20]
157 movdqu xmm9, xmmword ptr [r9+rdx-0x20]
158 movdqu xmm10, xmmword ptr [r10+rdx-0x20]
159 movdqu xmm11, xmmword ptr [r11+rdx-0x20]
160 movdqa xmm12, xmm8
161 punpckldq xmm8, xmm9
162 punpckhdq xmm12, xmm9
163 movdqa xmm14, xmm10
164 punpckldq xmm10, xmm11
165 punpckhdq xmm14, xmm11
166 movdqa xmm9, xmm8
167 punpcklqdq xmm8, xmm10
168 punpckhqdq xmm9, xmm10
169 movdqa xmm13, xmm12
170 punpcklqdq xmm12, xmm14
171 punpckhqdq xmm13, xmm14
172 movdqa xmmword ptr [rsp+0x80], xmm8
173 movdqa xmmword ptr [rsp+0x90], xmm9
174 movdqa xmmword ptr [rsp+0xA0], xmm12
175 movdqa xmmword ptr [rsp+0xB0], xmm13
176 movdqu xmm8, xmmword ptr [r8+rdx-0x10]
177 movdqu xmm9, xmmword ptr [r9+rdx-0x10]
178 movdqu xmm10, xmmword ptr [r10+rdx-0x10]
179 movdqu xmm11, xmmword ptr [r11+rdx-0x10]
180 movdqa xmm12, xmm8
181 punpckldq xmm8, xmm9
182 punpckhdq xmm12, xmm9
183 movdqa xmm14, xmm10
184 punpckldq xmm10, xmm11
185 punpckhdq xmm14, xmm11
186 movdqa xmm9, xmm8
187 punpcklqdq xmm8, xmm10
188 punpckhqdq xmm9, xmm10
189 movdqa xmm13, xmm12
190 punpcklqdq xmm12, xmm14
191 punpckhqdq xmm13, xmm14
192 movdqa xmmword ptr [rsp+0xC0], xmm8
193 movdqa xmmword ptr [rsp+0xD0], xmm9
194 movdqa xmmword ptr [rsp+0xE0], xmm12
195 movdqa xmmword ptr [rsp+0xF0], xmm13
196 movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
197 movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
198 movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
199 movdqa xmm12, xmmword ptr [rsp+0x110]
200 movdqa xmm13, xmmword ptr [rsp+0x120]
201 movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
202 movd xmm15, eax
203 pshufd xmm15, xmm15, 0x00
204 prefetcht0 [r8+rdx+0x80]
205 prefetcht0 [r9+rdx+0x80]
206 prefetcht0 [r10+rdx+0x80]
207 prefetcht0 [r11+rdx+0x80]
208 paddd xmm0, xmmword ptr [rsp]
209 paddd xmm1, xmmword ptr [rsp+0x20]
210 paddd xmm2, xmmword ptr [rsp+0x40]
211 paddd xmm3, xmmword ptr [rsp+0x60]
212 paddd xmm0, xmm4
213 paddd xmm1, xmm5
214 paddd xmm2, xmm6
215 paddd xmm3, xmm7
216 pxor xmm12, xmm0
217 pxor xmm13, xmm1
218 pxor xmm14, xmm2
219 pxor xmm15, xmm3
220 movdqa xmm8, xmmword ptr [ROT16+rip]
221 pshufb xmm12, xmm8
222 pshufb xmm13, xmm8
223 pshufb xmm14, xmm8
224 pshufb xmm15, xmm8
225 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
226 paddd xmm8, xmm12
227 paddd xmm9, xmm13
228 paddd xmm10, xmm14
229 paddd xmm11, xmm15
230 pxor xmm4, xmm8
231 pxor xmm5, xmm9
232 pxor xmm6, xmm10
233 pxor xmm7, xmm11
234 movdqa xmmword ptr [rsp+0x100], xmm8
235 movdqa xmm8, xmm4
236 psrld xmm8, 12
237 pslld xmm4, 20
238 por xmm4, xmm8
239 movdqa xmm8, xmm5
240 psrld xmm8, 12
241 pslld xmm5, 20
242 por xmm5, xmm8
243 movdqa xmm8, xmm6
244 psrld xmm8, 12
245 pslld xmm6, 20
246 por xmm6, xmm8
247 movdqa xmm8, xmm7
248 psrld xmm8, 12
249 pslld xmm7, 20
250 por xmm7, xmm8
251 paddd xmm0, xmmword ptr [rsp+0x10]
252 paddd xmm1, xmmword ptr [rsp+0x30]
253 paddd xmm2, xmmword ptr [rsp+0x50]
254 paddd xmm3, xmmword ptr [rsp+0x70]
255 paddd xmm0, xmm4
256 paddd xmm1, xmm5
257 paddd xmm2, xmm6
258 paddd xmm3, xmm7
259 pxor xmm12, xmm0
260 pxor xmm13, xmm1
261 pxor xmm14, xmm2
262 pxor xmm15, xmm3
263 movdqa xmm8, xmmword ptr [ROT8+rip]
264 pshufb xmm12, xmm8
265 pshufb xmm13, xmm8
266 pshufb xmm14, xmm8
267 pshufb xmm15, xmm8
268 movdqa xmm8, xmmword ptr [rsp+0x100]
269 paddd xmm8, xmm12
270 paddd xmm9, xmm13
271 paddd xmm10, xmm14
272 paddd xmm11, xmm15
273 pxor xmm4, xmm8
274 pxor xmm5, xmm9
275 pxor xmm6, xmm10
276 pxor xmm7, xmm11
277 movdqa xmmword ptr [rsp+0x100], xmm8
278 movdqa xmm8, xmm4
279 psrld xmm8, 7
280 pslld xmm4, 25
281 por xmm4, xmm8
282 movdqa xmm8, xmm5
283 psrld xmm8, 7
284 pslld xmm5, 25
285 por xmm5, xmm8
286 movdqa xmm8, xmm6
287 psrld xmm8, 7
288 pslld xmm6, 25
289 por xmm6, xmm8
290 movdqa xmm8, xmm7
291 psrld xmm8, 7
292 pslld xmm7, 25
293 por xmm7, xmm8
294 paddd xmm0, xmmword ptr [rsp+0x80]
295 paddd xmm1, xmmword ptr [rsp+0xA0]
296 paddd xmm2, xmmword ptr [rsp+0xC0]
297 paddd xmm3, xmmword ptr [rsp+0xE0]
298 paddd xmm0, xmm5
299 paddd xmm1, xmm6
300 paddd xmm2, xmm7
301 paddd xmm3, xmm4
302 pxor xmm15, xmm0
303 pxor xmm12, xmm1
304 pxor xmm13, xmm2
305 pxor xmm14, xmm3
306 movdqa xmm8, xmmword ptr [ROT16+rip]
307 pshufb xmm15, xmm8
308 pshufb xmm12, xmm8
309 pshufb xmm13, xmm8
310 pshufb xmm14, xmm8
311 paddd xmm10, xmm15
312 paddd xmm11, xmm12
313 movdqa xmm8, xmmword ptr [rsp+0x100]
314 paddd xmm8, xmm13
315 paddd xmm9, xmm14
316 pxor xmm5, xmm10
317 pxor xmm6, xmm11
318 pxor xmm7, xmm8
319 pxor xmm4, xmm9
320 movdqa xmmword ptr [rsp+0x100], xmm8
321 movdqa xmm8, xmm5
322 psrld xmm8, 12
323 pslld xmm5, 20
324 por xmm5, xmm8
325 movdqa xmm8, xmm6
326 psrld xmm8, 12
327 pslld xmm6, 20
328 por xmm6, xmm8
329 movdqa xmm8, xmm7
330 psrld xmm8, 12
331 pslld xmm7, 20
332 por xmm7, xmm8
333 movdqa xmm8, xmm4
334 psrld xmm8, 12
335 pslld xmm4, 20
336 por xmm4, xmm8
337 paddd xmm0, xmmword ptr [rsp+0x90]
338 paddd xmm1, xmmword ptr [rsp+0xB0]
339 paddd xmm2, xmmword ptr [rsp+0xD0]
340 paddd xmm3, xmmword ptr [rsp+0xF0]
341 paddd xmm0, xmm5
342 paddd xmm1, xmm6
343 paddd xmm2, xmm7
344 paddd xmm3, xmm4
345 pxor xmm15, xmm0
346 pxor xmm12, xmm1
347 pxor xmm13, xmm2
348 pxor xmm14, xmm3
349 movdqa xmm8, xmmword ptr [ROT8+rip]
350 pshufb xmm15, xmm8
351 pshufb xmm12, xmm8
352 pshufb xmm13, xmm8
353 pshufb xmm14, xmm8
354 paddd xmm10, xmm15
355 paddd xmm11, xmm12
356 movdqa xmm8, xmmword ptr [rsp+0x100]
357 paddd xmm8, xmm13
358 paddd xmm9, xmm14
359 pxor xmm5, xmm10
360 pxor xmm6, xmm11
361 pxor xmm7, xmm8
362 pxor xmm4, xmm9
363 movdqa xmmword ptr [rsp+0x100], xmm8
364 movdqa xmm8, xmm5
365 psrld xmm8, 7
366 pslld xmm5, 25
367 por xmm5, xmm8
368 movdqa xmm8, xmm6
369 psrld xmm8, 7
370 pslld xmm6, 25
371 por xmm6, xmm8
372 movdqa xmm8, xmm7
373 psrld xmm8, 7
374 pslld xmm7, 25
375 por xmm7, xmm8
376 movdqa xmm8, xmm4
377 psrld xmm8, 7
378 pslld xmm4, 25
379 por xmm4, xmm8
380 paddd xmm0, xmmword ptr [rsp+0x20]
381 paddd xmm1, xmmword ptr [rsp+0x30]
382 paddd xmm2, xmmword ptr [rsp+0x70]
383 paddd xmm3, xmmword ptr [rsp+0x40]
384 paddd xmm0, xmm4
385 paddd xmm1, xmm5
386 paddd xmm2, xmm6
387 paddd xmm3, xmm7
388 pxor xmm12, xmm0
389 pxor xmm13, xmm1
390 pxor xmm14, xmm2
391 pxor xmm15, xmm3
392 movdqa xmm8, xmmword ptr [ROT16+rip]
393 pshufb xmm12, xmm8
394 pshufb xmm13, xmm8
395 pshufb xmm14, xmm8
396 pshufb xmm15, xmm8
397 movdqa xmm8, xmmword ptr [rsp+0x100]
398 paddd xmm8, xmm12
399 paddd xmm9, xmm13
400 paddd xmm10, xmm14
401 paddd xmm11, xmm15
402 pxor xmm4, xmm8
403 pxor xmm5, xmm9
404 pxor xmm6, xmm10
405 pxor xmm7, xmm11
406 movdqa xmmword ptr [rsp+0x100], xmm8
407 movdqa xmm8, xmm4
408 psrld xmm8, 12
409 pslld xmm4, 20
410 por xmm4, xmm8
411 movdqa xmm8, xmm5
412 psrld xmm8, 12
413 pslld xmm5, 20
414 por xmm5, xmm8
415 movdqa xmm8, xmm6
416 psrld xmm8, 12
417 pslld xmm6, 20
418 por xmm6, xmm8
419 movdqa xmm8, xmm7
420 psrld xmm8, 12
421 pslld xmm7, 20
422 por xmm7, xmm8
423 paddd xmm0, xmmword ptr [rsp+0x60]
424 paddd xmm1, xmmword ptr [rsp+0xA0]
425 paddd xmm2, xmmword ptr [rsp]
426 paddd xmm3, xmmword ptr [rsp+0xD0]
427 paddd xmm0, xmm4
428 paddd xmm1, xmm5
429 paddd xmm2, xmm6
430 paddd xmm3, xmm7
431 pxor xmm12, xmm0
432 pxor xmm13, xmm1
433 pxor xmm14, xmm2
434 pxor xmm15, xmm3
435 movdqa xmm8, xmmword ptr [ROT8+rip]
436 pshufb xmm12, xmm8
437 pshufb xmm13, xmm8
438 pshufb xmm14, xmm8
439 pshufb xmm15, xmm8
440 movdqa xmm8, xmmword ptr [rsp+0x100]
441 paddd xmm8, xmm12
442 paddd xmm9, xmm13
443 paddd xmm10, xmm14
444 paddd xmm11, xmm15
445 pxor xmm4, xmm8
446 pxor xmm5, xmm9
447 pxor xmm6, xmm10
448 pxor xmm7, xmm11
449 movdqa xmmword ptr [rsp+0x100], xmm8
450 movdqa xmm8, xmm4
451 psrld xmm8, 7
452 pslld xmm4, 25
453 por xmm4, xmm8
454 movdqa xmm8, xmm5
455 psrld xmm8, 7
456 pslld xmm5, 25
457 por xmm5, xmm8
458 movdqa xmm8, xmm6
459 psrld xmm8, 7
460 pslld xmm6, 25
461 por xmm6, xmm8
462 movdqa xmm8, xmm7
463 psrld xmm8, 7
464 pslld xmm7, 25
465 por xmm7, xmm8
466 paddd xmm0, xmmword ptr [rsp+0x10]
467 paddd xmm1, xmmword ptr [rsp+0xC0]
468 paddd xmm2, xmmword ptr [rsp+0x90]
469 paddd xmm3, xmmword ptr [rsp+0xF0]
470 paddd xmm0, xmm5
471 paddd xmm1, xmm6
472 paddd xmm2, xmm7
473 paddd xmm3, xmm4
474 pxor xmm15, xmm0
475 pxor xmm12, xmm1
476 pxor xmm13, xmm2
477 pxor xmm14, xmm3
478 movdqa xmm8, xmmword ptr [ROT16+rip]
479 pshufb xmm15, xmm8
480 pshufb xmm12, xmm8
481 pshufb xmm13, xmm8
482 pshufb xmm14, xmm8
483 paddd xmm10, xmm15
484 paddd xmm11, xmm12
485 movdqa xmm8, xmmword ptr [rsp+0x100]
486 paddd xmm8, xmm13
487 paddd xmm9, xmm14
488 pxor xmm5, xmm10
489 pxor xmm6, xmm11
490 pxor xmm7, xmm8
491 pxor xmm4, xmm9
492 movdqa xmmword ptr [rsp+0x100], xmm8
493 movdqa xmm8, xmm5
494 psrld xmm8, 12
495 pslld xmm5, 20
496 por xmm5, xmm8
497 movdqa xmm8, xmm6
498 psrld xmm8, 12
499 pslld xmm6, 20
500 por xmm6, xmm8
501 movdqa xmm8, xmm7
502 psrld xmm8, 12
503 pslld xmm7, 20
504 por xmm7, xmm8
505 movdqa xmm8, xmm4
506 psrld xmm8, 12
507 pslld xmm4, 20
508 por xmm4, xmm8
509 paddd xmm0, xmmword ptr [rsp+0xB0]
510 paddd xmm1, xmmword ptr [rsp+0x50]
511 paddd xmm2, xmmword ptr [rsp+0xE0]
512 paddd xmm3, xmmword ptr [rsp+0x80]
513 paddd xmm0, xmm5
514 paddd xmm1, xmm6
515 paddd xmm2, xmm7
516 paddd xmm3, xmm4
517 pxor xmm15, xmm0
518 pxor xmm12, xmm1
519 pxor xmm13, xmm2
520 pxor xmm14, xmm3
521 movdqa xmm8, xmmword ptr [ROT8+rip]
522 pshufb xmm15, xmm8
523 pshufb xmm12, xmm8
524 pshufb xmm13, xmm8
525 pshufb xmm14, xmm8
526 paddd xmm10, xmm15
527 paddd xmm11, xmm12
528 movdqa xmm8, xmmword ptr [rsp+0x100]
529 paddd xmm8, xmm13
530 paddd xmm9, xmm14
531 pxor xmm5, xmm10
532 pxor xmm6, xmm11
533 pxor xmm7, xmm8
534 pxor xmm4, xmm9
535 movdqa xmmword ptr [rsp+0x100], xmm8
536 movdqa xmm8, xmm5
537 psrld xmm8, 7
538 pslld xmm5, 25
539 por xmm5, xmm8
540 movdqa xmm8, xmm6
541 psrld xmm8, 7
542 pslld xmm6, 25
543 por xmm6, xmm8
544 movdqa xmm8, xmm7
545 psrld xmm8, 7
546 pslld xmm7, 25
547 por xmm7, xmm8
548 movdqa xmm8, xmm4
549 psrld xmm8, 7
550 pslld xmm4, 25
551 por xmm4, xmm8
552 paddd xmm0, xmmword ptr [rsp+0x30]
553 paddd xmm1, xmmword ptr [rsp+0xA0]
554 paddd xmm2, xmmword ptr [rsp+0xD0]
555 paddd xmm3, xmmword ptr [rsp+0x70]
556 paddd xmm0, xmm4
557 paddd xmm1, xmm5
558 paddd xmm2, xmm6
559 paddd xmm3, xmm7
560 pxor xmm12, xmm0
561 pxor xmm13, xmm1
562 pxor xmm14, xmm2
563 pxor xmm15, xmm3
564 movdqa xmm8, xmmword ptr [ROT16+rip]
565 pshufb xmm12, xmm8
566 pshufb xmm13, xmm8
567 pshufb xmm14, xmm8
568 pshufb xmm15, xmm8
569 movdqa xmm8, xmmword ptr [rsp+0x100]
570 paddd xmm8, xmm12
571 paddd xmm9, xmm13
572 paddd xmm10, xmm14
573 paddd xmm11, xmm15
574 pxor xmm4, xmm8
575 pxor xmm5, xmm9
576 pxor xmm6, xmm10
577 pxor xmm7, xmm11
578 movdqa xmmword ptr [rsp+0x100], xmm8
579 movdqa xmm8, xmm4
580 psrld xmm8, 12
581 pslld xmm4, 20
582 por xmm4, xmm8
583 movdqa xmm8, xmm5
584 psrld xmm8, 12
585 pslld xmm5, 20
586 por xmm5, xmm8
587 movdqa xmm8, xmm6
588 psrld xmm8, 12
589 pslld xmm6, 20
590 por xmm6, xmm8
591 movdqa xmm8, xmm7
592 psrld xmm8, 12
593 pslld xmm7, 20
594 por xmm7, xmm8
595 paddd xmm0, xmmword ptr [rsp+0x40]
596 paddd xmm1, xmmword ptr [rsp+0xC0]
597 paddd xmm2, xmmword ptr [rsp+0x20]
598 paddd xmm3, xmmword ptr [rsp+0xE0]
599 paddd xmm0, xmm4
600 paddd xmm1, xmm5
601 paddd xmm2, xmm6
602 paddd xmm3, xmm7
603 pxor xmm12, xmm0
604 pxor xmm13, xmm1
605 pxor xmm14, xmm2
606 pxor xmm15, xmm3
607 movdqa xmm8, xmmword ptr [ROT8+rip]
608 pshufb xmm12, xmm8
609 pshufb xmm13, xmm8
610 pshufb xmm14, xmm8
611 pshufb xmm15, xmm8
612 movdqa xmm8, xmmword ptr [rsp+0x100]
613 paddd xmm8, xmm12
614 paddd xmm9, xmm13
615 paddd xmm10, xmm14
616 paddd xmm11, xmm15
617 pxor xmm4, xmm8
618 pxor xmm5, xmm9
619 pxor xmm6, xmm10
620 pxor xmm7, xmm11
621 movdqa xmmword ptr [rsp+0x100], xmm8
622 movdqa xmm8, xmm4
623 psrld xmm8, 7
624 pslld xmm4, 25
625 por xmm4, xmm8
626 movdqa xmm8, xmm5
627 psrld xmm8, 7
628 pslld xmm5, 25
629 por xmm5, xmm8
630 movdqa xmm8, xmm6
631 psrld xmm8, 7
632 pslld xmm6, 25
633 por xmm6, xmm8
634 movdqa xmm8, xmm7
635 psrld xmm8, 7
636 pslld xmm7, 25
637 por xmm7, xmm8
638 paddd xmm0, xmmword ptr [rsp+0x60]
639 paddd xmm1, xmmword ptr [rsp+0x90]
640 paddd xmm2, xmmword ptr [rsp+0xB0]
641 paddd xmm3, xmmword ptr [rsp+0x80]
642 paddd xmm0, xmm5
643 paddd xmm1, xmm6
644 paddd xmm2, xmm7
645 paddd xmm3, xmm4
646 pxor xmm15, xmm0
647 pxor xmm12, xmm1
648 pxor xmm13, xmm2
649 pxor xmm14, xmm3
650 movdqa xmm8, xmmword ptr [ROT16+rip]
651 pshufb xmm15, xmm8
652 pshufb xmm12, xmm8
653 pshufb xmm13, xmm8
654 pshufb xmm14, xmm8
655 paddd xmm10, xmm15
656 paddd xmm11, xmm12
657 movdqa xmm8, xmmword ptr [rsp+0x100]
658 paddd xmm8, xmm13
659 paddd xmm9, xmm14
660 pxor xmm5, xmm10
661 pxor xmm6, xmm11
662 pxor xmm7, xmm8
663 pxor xmm4, xmm9
664 movdqa xmmword ptr [rsp+0x100], xmm8
665 movdqa xmm8, xmm5
666 psrld xmm8, 12
667 pslld xmm5, 20
668 por xmm5, xmm8
669 movdqa xmm8, xmm6
670 psrld xmm8, 12
671 pslld xmm6, 20
672 por xmm6, xmm8
673 movdqa xmm8, xmm7
674 psrld xmm8, 12
675 pslld xmm7, 20
676 por xmm7, xmm8
677 movdqa xmm8, xmm4
678 psrld xmm8, 12
679 pslld xmm4, 20
680 por xmm4, xmm8
681 paddd xmm0, xmmword ptr [rsp+0x50]
682 paddd xmm1, xmmword ptr [rsp]
683 paddd xmm2, xmmword ptr [rsp+0xF0]
684 paddd xmm3, xmmword ptr [rsp+0x10]
685 paddd xmm0, xmm5
686 paddd xmm1, xmm6
687 paddd xmm2, xmm7
688 paddd xmm3, xmm4
689 pxor xmm15, xmm0
690 pxor xmm12, xmm1
691 pxor xmm13, xmm2
692 pxor xmm14, xmm3
693 movdqa xmm8, xmmword ptr [ROT8+rip]
694 pshufb xmm15, xmm8
695 pshufb xmm12, xmm8
696 pshufb xmm13, xmm8
697 pshufb xmm14, xmm8
698 paddd xmm10, xmm15
699 paddd xmm11, xmm12
700 movdqa xmm8, xmmword ptr [rsp+0x100]
701 paddd xmm8, xmm13
702 paddd xmm9, xmm14
703 pxor xmm5, xmm10
704 pxor xmm6, xmm11
705 pxor xmm7, xmm8
706 pxor xmm4, xmm9
707 movdqa xmmword ptr [rsp+0x100], xmm8
708 movdqa xmm8, xmm5
709 psrld xmm8, 7
710 pslld xmm5, 25
711 por xmm5, xmm8
712 movdqa xmm8, xmm6
713 psrld xmm8, 7
714 pslld xmm6, 25
715 por xmm6, xmm8
716 movdqa xmm8, xmm7
717 psrld xmm8, 7
718 pslld xmm7, 25
719 por xmm7, xmm8
720 movdqa xmm8, xmm4
721 psrld xmm8, 7
722 pslld xmm4, 25
723 por xmm4, xmm8
724 paddd xmm0, xmmword ptr [rsp+0xA0]
725 paddd xmm1, xmmword ptr [rsp+0xC0]
726 paddd xmm2, xmmword ptr [rsp+0xE0]
727 paddd xmm3, xmmword ptr [rsp+0xD0]
728 paddd xmm0, xmm4
729 paddd xmm1, xmm5
730 paddd xmm2, xmm6
731 paddd xmm3, xmm7
732 pxor xmm12, xmm0
733 pxor xmm13, xmm1
734 pxor xmm14, xmm2
735 pxor xmm15, xmm3
736 movdqa xmm8, xmmword ptr [ROT16+rip]
737 pshufb xmm12, xmm8
738 pshufb xmm13, xmm8
739 pshufb xmm14, xmm8
740 pshufb xmm15, xmm8
741 movdqa xmm8, xmmword ptr [rsp+0x100]
742 paddd xmm8, xmm12
743 paddd xmm9, xmm13
744 paddd xmm10, xmm14
745 paddd xmm11, xmm15
746 pxor xmm4, xmm8
747 pxor xmm5, xmm9
748 pxor xmm6, xmm10
749 pxor xmm7, xmm11
750 movdqa xmmword ptr [rsp+0x100], xmm8
751 movdqa xmm8, xmm4
752 psrld xmm8, 12
753 pslld xmm4, 20
754 por xmm4, xmm8
755 movdqa xmm8, xmm5
756 psrld xmm8, 12
757 pslld xmm5, 20
758 por xmm5, xmm8
759 movdqa xmm8, xmm6
760 psrld xmm8, 12
761 pslld xmm6, 20
762 por xmm6, xmm8
763 movdqa xmm8, xmm7
764 psrld xmm8, 12
765 pslld xmm7, 20
766 por xmm7, xmm8
767 paddd xmm0, xmmword ptr [rsp+0x70]
768 paddd xmm1, xmmword ptr [rsp+0x90]
769 paddd xmm2, xmmword ptr [rsp+0x30]
770 paddd xmm3, xmmword ptr [rsp+0xF0]
771 paddd xmm0, xmm4
772 paddd xmm1, xmm5
773 paddd xmm2, xmm6
774 paddd xmm3, xmm7
775 pxor xmm12, xmm0
776 pxor xmm13, xmm1
777 pxor xmm14, xmm2
778 pxor xmm15, xmm3
779 movdqa xmm8, xmmword ptr [ROT8+rip]
780 pshufb xmm12, xmm8
781 pshufb xmm13, xmm8
782 pshufb xmm14, xmm8
783 pshufb xmm15, xmm8
784 movdqa xmm8, xmmword ptr [rsp+0x100]
785 paddd xmm8, xmm12
786 paddd xmm9, xmm13
787 paddd xmm10, xmm14
788 paddd xmm11, xmm15
789 pxor xmm4, xmm8
790 pxor xmm5, xmm9
791 pxor xmm6, xmm10
792 pxor xmm7, xmm11
793 movdqa xmmword ptr [rsp+0x100], xmm8
794 movdqa xmm8, xmm4
795 psrld xmm8, 7
796 pslld xmm4, 25
797 por xmm4, xmm8
798 movdqa xmm8, xmm5
799 psrld xmm8, 7
800 pslld xmm5, 25
801 por xmm5, xmm8
802 movdqa xmm8, xmm6
803 psrld xmm8, 7
804 pslld xmm6, 25
805 por xmm6, xmm8
806 movdqa xmm8, xmm7
807 psrld xmm8, 7
808 pslld xmm7, 25
809 por xmm7, xmm8
810 paddd xmm0, xmmword ptr [rsp+0x40]
811 paddd xmm1, xmmword ptr [rsp+0xB0]
812 paddd xmm2, xmmword ptr [rsp+0x50]
813 paddd xmm3, xmmword ptr [rsp+0x10]
814 paddd xmm0, xmm5
815 paddd xmm1, xmm6
816 paddd xmm2, xmm7
817 paddd xmm3, xmm4
818 pxor xmm15, xmm0
819 pxor xmm12, xmm1
820 pxor xmm13, xmm2
821 pxor xmm14, xmm3
822 movdqa xmm8, xmmword ptr [ROT16+rip]
823 pshufb xmm15, xmm8
824 pshufb xmm12, xmm8
825 pshufb xmm13, xmm8
826 pshufb xmm14, xmm8
827 paddd xmm10, xmm15
828 paddd xmm11, xmm12
829 movdqa xmm8, xmmword ptr [rsp+0x100]
830 paddd xmm8, xmm13
831 paddd xmm9, xmm14
832 pxor xmm5, xmm10
833 pxor xmm6, xmm11
834 pxor xmm7, xmm8
835 pxor xmm4, xmm9
836 movdqa xmmword ptr [rsp+0x100], xmm8
837 movdqa xmm8, xmm5
838 psrld xmm8, 12
839 pslld xmm5, 20
840 por xmm5, xmm8
841 movdqa xmm8, xmm6
842 psrld xmm8, 12
843 pslld xmm6, 20
844 por xmm6, xmm8
845 movdqa xmm8, xmm7
846 psrld xmm8, 12
847 pslld xmm7, 20
848 por xmm7, xmm8
849 movdqa xmm8, xmm4
850 psrld xmm8, 12
851 pslld xmm4, 20
852 por xmm4, xmm8
853 paddd xmm0, xmmword ptr [rsp]
854 paddd xmm1, xmmword ptr [rsp+0x20]
855 paddd xmm2, xmmword ptr [rsp+0x80]
856 paddd xmm3, xmmword ptr [rsp+0x60]
857 paddd xmm0, xmm5
858 paddd xmm1, xmm6
859 paddd xmm2, xmm7
860 paddd xmm3, xmm4
861 pxor xmm15, xmm0
862 pxor xmm12, xmm1
863 pxor xmm13, xmm2
864 pxor xmm14, xmm3
865 movdqa xmm8, xmmword ptr [ROT8+rip]
866 pshufb xmm15, xmm8
867 pshufb xmm12, xmm8
868 pshufb xmm13, xmm8
869 pshufb xmm14, xmm8
870 paddd xmm10, xmm15
871 paddd xmm11, xmm12
872 movdqa xmm8, xmmword ptr [rsp+0x100]
873 paddd xmm8, xmm13
874 paddd xmm9, xmm14
875 pxor xmm5, xmm10
876 pxor xmm6, xmm11
877 pxor xmm7, xmm8
878 pxor xmm4, xmm9
879 movdqa xmmword ptr [rsp+0x100], xmm8
880 movdqa xmm8, xmm5
881 psrld xmm8, 7
882 pslld xmm5, 25
883 por xmm5, xmm8
884 movdqa xmm8, xmm6
885 psrld xmm8, 7
886 pslld xmm6, 25
887 por xmm6, xmm8
888 movdqa xmm8, xmm7
889 psrld xmm8, 7
890 pslld xmm7, 25
891 por xmm7, xmm8
892 movdqa xmm8, xmm4
893 psrld xmm8, 7
894 pslld xmm4, 25
895 por xmm4, xmm8
896 paddd xmm0, xmmword ptr [rsp+0xC0]
897 paddd xmm1, xmmword ptr [rsp+0x90]
898 paddd xmm2, xmmword ptr [rsp+0xF0]
899 paddd xmm3, xmmword ptr [rsp+0xE0]
900 paddd xmm0, xmm4
901 paddd xmm1, xmm5
902 paddd xmm2, xmm6
903 paddd xmm3, xmm7
904 pxor xmm12, xmm0
905 pxor xmm13, xmm1
906 pxor xmm14, xmm2
907 pxor xmm15, xmm3
908 movdqa xmm8, xmmword ptr [ROT16+rip]
909 pshufb xmm12, xmm8
910 pshufb xmm13, xmm8
911 pshufb xmm14, xmm8
912 pshufb xmm15, xmm8
913 movdqa xmm8, xmmword ptr [rsp+0x100]
914 paddd xmm8, xmm12
915 paddd xmm9, xmm13
916 paddd xmm10, xmm14
917 paddd xmm11, xmm15
918 pxor xmm4, xmm8
919 pxor xmm5, xmm9
920 pxor xmm6, xmm10
921 pxor xmm7, xmm11
922 movdqa xmmword ptr [rsp+0x100], xmm8
923 movdqa xmm8, xmm4
924 psrld xmm8, 12
925 pslld xmm4, 20
926 por xmm4, xmm8
927 movdqa xmm8, xmm5
928 psrld xmm8, 12
929 pslld xmm5, 20
930 por xmm5, xmm8
931 movdqa xmm8, xmm6
932 psrld xmm8, 12
933 pslld xmm6, 20
934 por xmm6, xmm8
935 movdqa xmm8, xmm7
936 psrld xmm8, 12
937 pslld xmm7, 20
938 por xmm7, xmm8
939 paddd xmm0, xmmword ptr [rsp+0xD0]
940 paddd xmm1, xmmword ptr [rsp+0xB0]
941 paddd xmm2, xmmword ptr [rsp+0xA0]
942 paddd xmm3, xmmword ptr [rsp+0x80]
943 paddd xmm0, xmm4
944 paddd xmm1, xmm5
945 paddd xmm2, xmm6
946 paddd xmm3, xmm7
947 pxor xmm12, xmm0
948 pxor xmm13, xmm1
949 pxor xmm14, xmm2
950 pxor xmm15, xmm3
951 movdqa xmm8, xmmword ptr [ROT8+rip]
952 pshufb xmm12, xmm8
953 pshufb xmm13, xmm8
954 pshufb xmm14, xmm8
955 pshufb xmm15, xmm8
956 movdqa xmm8, xmmword ptr [rsp+0x100]
957 paddd xmm8, xmm12
958 paddd xmm9, xmm13
959 paddd xmm10, xmm14
960 paddd xmm11, xmm15
961 pxor xmm4, xmm8
962 pxor xmm5, xmm9
963 pxor xmm6, xmm10
964 pxor xmm7, xmm11
965 movdqa xmmword ptr [rsp+0x100], xmm8
966 movdqa xmm8, xmm4
967 psrld xmm8, 7
968 pslld xmm4, 25
969 por xmm4, xmm8
970 movdqa xmm8, xmm5
971 psrld xmm8, 7
972 pslld xmm5, 25
973 por xmm5, xmm8
974 movdqa xmm8, xmm6
975 psrld xmm8, 7
976 pslld xmm6, 25
977 por xmm6, xmm8
978 movdqa xmm8, xmm7
979 psrld xmm8, 7
980 pslld xmm7, 25
981 por xmm7, xmm8
982 paddd xmm0, xmmword ptr [rsp+0x70]
983 paddd xmm1, xmmword ptr [rsp+0x50]
984 paddd xmm2, xmmword ptr [rsp]
985 paddd xmm3, xmmword ptr [rsp+0x60]
986 paddd xmm0, xmm5
987 paddd xmm1, xmm6
988 paddd xmm2, xmm7
989 paddd xmm3, xmm4
990 pxor xmm15, xmm0
991 pxor xmm12, xmm1
992 pxor xmm13, xmm2
993 pxor xmm14, xmm3
994 movdqa xmm8, xmmword ptr [ROT16+rip]
995 pshufb xmm15, xmm8
996 pshufb xmm12, xmm8
997 pshufb xmm13, xmm8
998 pshufb xmm14, xmm8
999 paddd xmm10, xmm15
1000 paddd xmm11, xmm12
1001 movdqa xmm8, xmmword ptr [rsp+0x100]
1002 paddd xmm8, xmm13
1003 paddd xmm9, xmm14
1004 pxor xmm5, xmm10
1005 pxor xmm6, xmm11
1006 pxor xmm7, xmm8
1007 pxor xmm4, xmm9
1008 movdqa xmmword ptr [rsp+0x100], xmm8
1009 movdqa xmm8, xmm5
1010 psrld xmm8, 12
1011 pslld xmm5, 20
1012 por xmm5, xmm8
1013 movdqa xmm8, xmm6
1014 psrld xmm8, 12
1015 pslld xmm6, 20
1016 por xmm6, xmm8
1017 movdqa xmm8, xmm7
1018 psrld xmm8, 12
1019 pslld xmm7, 20
1020 por xmm7, xmm8
1021 movdqa xmm8, xmm4
1022 psrld xmm8, 12
1023 pslld xmm4, 20
1024 por xmm4, xmm8
1025 paddd xmm0, xmmword ptr [rsp+0x20]
1026 paddd xmm1, xmmword ptr [rsp+0x30]
1027 paddd xmm2, xmmword ptr [rsp+0x10]
1028 paddd xmm3, xmmword ptr [rsp+0x40]
1029 paddd xmm0, xmm5
1030 paddd xmm1, xmm6
1031 paddd xmm2, xmm7
1032 paddd xmm3, xmm4
1033 pxor xmm15, xmm0
1034 pxor xmm12, xmm1
1035 pxor xmm13, xmm2
1036 pxor xmm14, xmm3
1037 movdqa xmm8, xmmword ptr [ROT8+rip]
1038 pshufb xmm15, xmm8
1039 pshufb xmm12, xmm8
1040 pshufb xmm13, xmm8
1041 pshufb xmm14, xmm8
1042 paddd xmm10, xmm15
1043 paddd xmm11, xmm12
1044 movdqa xmm8, xmmword ptr [rsp+0x100]
1045 paddd xmm8, xmm13
1046 paddd xmm9, xmm14
1047 pxor xmm5, xmm10
1048 pxor xmm6, xmm11
1049 pxor xmm7, xmm8
1050 pxor xmm4, xmm9
1051 movdqa xmmword ptr [rsp+0x100], xmm8
1052 movdqa xmm8, xmm5
1053 psrld xmm8, 7
1054 pslld xmm5, 25
1055 por xmm5, xmm8
1056 movdqa xmm8, xmm6
1057 psrld xmm8, 7
1058 pslld xmm6, 25
1059 por xmm6, xmm8
1060 movdqa xmm8, xmm7
1061 psrld xmm8, 7
1062 pslld xmm7, 25
1063 por xmm7, xmm8
1064 movdqa xmm8, xmm4
1065 psrld xmm8, 7
1066 pslld xmm4, 25
1067 por xmm4, xmm8
1068 paddd xmm0, xmmword ptr [rsp+0x90]
1069 paddd xmm1, xmmword ptr [rsp+0xB0]
1070 paddd xmm2, xmmword ptr [rsp+0x80]
1071 paddd xmm3, xmmword ptr [rsp+0xF0]
1072 paddd xmm0, xmm4
1073 paddd xmm1, xmm5
1074 paddd xmm2, xmm6
1075 paddd xmm3, xmm7
1076 pxor xmm12, xmm0
1077 pxor xmm13, xmm1
1078 pxor xmm14, xmm2
1079 pxor xmm15, xmm3
1080 movdqa xmm8, xmmword ptr [ROT16+rip]
1081 pshufb xmm12, xmm8
1082 pshufb xmm13, xmm8
1083 pshufb xmm14, xmm8
1084 pshufb xmm15, xmm8
1085 movdqa xmm8, xmmword ptr [rsp+0x100]
1086 paddd xmm8, xmm12
1087 paddd xmm9, xmm13
1088 paddd xmm10, xmm14
1089 paddd xmm11, xmm15
1090 pxor xmm4, xmm8
1091 pxor xmm5, xmm9
1092 pxor xmm6, xmm10
1093 pxor xmm7, xmm11
1094 movdqa xmmword ptr [rsp+0x100], xmm8
1095 movdqa xmm8, xmm4
1096 psrld xmm8, 12
1097 pslld xmm4, 20
1098 por xmm4, xmm8
1099 movdqa xmm8, xmm5
1100 psrld xmm8, 12
1101 pslld xmm5, 20
1102 por xmm5, xmm8
1103 movdqa xmm8, xmm6
1104 psrld xmm8, 12
1105 pslld xmm6, 20
1106 por xmm6, xmm8
1107 movdqa xmm8, xmm7
1108 psrld xmm8, 12
1109 pslld xmm7, 20
1110 por xmm7, xmm8
1111 paddd xmm0, xmmword ptr [rsp+0xE0]
1112 paddd xmm1, xmmword ptr [rsp+0x50]
1113 paddd xmm2, xmmword ptr [rsp+0xC0]
1114 paddd xmm3, xmmword ptr [rsp+0x10]
1115 paddd xmm0, xmm4
1116 paddd xmm1, xmm5
1117 paddd xmm2, xmm6
1118 paddd xmm3, xmm7
1119 pxor xmm12, xmm0
1120 pxor xmm13, xmm1
1121 pxor xmm14, xmm2
1122 pxor xmm15, xmm3
1123 movdqa xmm8, xmmword ptr [ROT8+rip]
1124 pshufb xmm12, xmm8
1125 pshufb xmm13, xmm8
1126 pshufb xmm14, xmm8
1127 pshufb xmm15, xmm8
1128 movdqa xmm8, xmmword ptr [rsp+0x100]
1129 paddd xmm8, xmm12
1130 paddd xmm9, xmm13
1131 paddd xmm10, xmm14
1132 paddd xmm11, xmm15
1133 pxor xmm4, xmm8
1134 pxor xmm5, xmm9
1135 pxor xmm6, xmm10
1136 pxor xmm7, xmm11
1137 movdqa xmmword ptr [rsp+0x100], xmm8
1138 movdqa xmm8, xmm4
1139 psrld xmm8, 7
1140 pslld xmm4, 25
1141 por xmm4, xmm8
1142 movdqa xmm8, xmm5
1143 psrld xmm8, 7
1144 pslld xmm5, 25
1145 por xmm5, xmm8
1146 movdqa xmm8, xmm6
1147 psrld xmm8, 7
1148 pslld xmm6, 25
1149 por xmm6, xmm8
1150 movdqa xmm8, xmm7
1151 psrld xmm8, 7
1152 pslld xmm7, 25
1153 por xmm7, xmm8
1154 paddd xmm0, xmmword ptr [rsp+0xD0]
1155 paddd xmm1, xmmword ptr [rsp]
1156 paddd xmm2, xmmword ptr [rsp+0x20]
1157 paddd xmm3, xmmword ptr [rsp+0x40]
1158 paddd xmm0, xmm5
1159 paddd xmm1, xmm6
1160 paddd xmm2, xmm7
1161 paddd xmm3, xmm4
1162 pxor xmm15, xmm0
1163 pxor xmm12, xmm1
1164 pxor xmm13, xmm2
1165 pxor xmm14, xmm3
1166 movdqa xmm8, xmmword ptr [ROT16+rip]
1167 pshufb xmm15, xmm8
1168 pshufb xmm12, xmm8
1169 pshufb xmm13, xmm8
1170 pshufb xmm14, xmm8
1171 paddd xmm10, xmm15
1172 paddd xmm11, xmm12
1173 movdqa xmm8, xmmword ptr [rsp+0x100]
1174 paddd xmm8, xmm13
1175 paddd xmm9, xmm14
1176 pxor xmm5, xmm10
1177 pxor xmm6, xmm11
1178 pxor xmm7, xmm8
1179 pxor xmm4, xmm9
1180 movdqa xmmword ptr [rsp+0x100], xmm8
1181 movdqa xmm8, xmm5
1182 psrld xmm8, 12
1183 pslld xmm5, 20
1184 por xmm5, xmm8
1185 movdqa xmm8, xmm6
1186 psrld xmm8, 12
1187 pslld xmm6, 20
1188 por xmm6, xmm8
1189 movdqa xmm8, xmm7
1190 psrld xmm8, 12
1191 pslld xmm7, 20
1192 por xmm7, xmm8
1193 movdqa xmm8, xmm4
1194 psrld xmm8, 12
1195 pslld xmm4, 20
1196 por xmm4, xmm8
1197 paddd xmm0, xmmword ptr [rsp+0x30]
1198 paddd xmm1, xmmword ptr [rsp+0xA0]
1199 paddd xmm2, xmmword ptr [rsp+0x60]
1200 paddd xmm3, xmmword ptr [rsp+0x70]
1201 paddd xmm0, xmm5
1202 paddd xmm1, xmm6
1203 paddd xmm2, xmm7
1204 paddd xmm3, xmm4
1205 pxor xmm15, xmm0
1206 pxor xmm12, xmm1
1207 pxor xmm13, xmm2
1208 pxor xmm14, xmm3
1209 movdqa xmm8, xmmword ptr [ROT8+rip]
1210 pshufb xmm15, xmm8
1211 pshufb xmm12, xmm8
1212 pshufb xmm13, xmm8
1213 pshufb xmm14, xmm8
1214 paddd xmm10, xmm15
1215 paddd xmm11, xmm12
1216 movdqa xmm8, xmmword ptr [rsp+0x100]
1217 paddd xmm8, xmm13
1218 paddd xmm9, xmm14
1219 pxor xmm5, xmm10
1220 pxor xmm6, xmm11
1221 pxor xmm7, xmm8
1222 pxor xmm4, xmm9
1223 movdqa xmmword ptr [rsp+0x100], xmm8
1224 movdqa xmm8, xmm5
1225 psrld xmm8, 7
1226 pslld xmm5, 25
1227 por xmm5, xmm8
1228 movdqa xmm8, xmm6
1229 psrld xmm8, 7
1230 pslld xmm6, 25
1231 por xmm6, xmm8
1232 movdqa xmm8, xmm7
1233 psrld xmm8, 7
1234 pslld xmm7, 25
1235 por xmm7, xmm8
1236 movdqa xmm8, xmm4
1237 psrld xmm8, 7
1238 pslld xmm4, 25
1239 por xmm4, xmm8
1240 paddd xmm0, xmmword ptr [rsp+0xB0]
1241 paddd xmm1, xmmword ptr [rsp+0x50]
1242 paddd xmm2, xmmword ptr [rsp+0x10]
1243 paddd xmm3, xmmword ptr [rsp+0x80]
1244 paddd xmm0, xmm4
1245 paddd xmm1, xmm5
1246 paddd xmm2, xmm6
1247 paddd xmm3, xmm7
1248 pxor xmm12, xmm0
1249 pxor xmm13, xmm1
1250 pxor xmm14, xmm2
1251 pxor xmm15, xmm3
1252 movdqa xmm8, xmmword ptr [ROT16+rip]
1253 pshufb xmm12, xmm8
1254 pshufb xmm13, xmm8
1255 pshufb xmm14, xmm8
1256 pshufb xmm15, xmm8
1257 movdqa xmm8, xmmword ptr [rsp+0x100]
1258 paddd xmm8, xmm12
1259 paddd xmm9, xmm13
1260 paddd xmm10, xmm14
1261 paddd xmm11, xmm15
1262 pxor xmm4, xmm8
1263 pxor xmm5, xmm9
1264 pxor xmm6, xmm10
1265 pxor xmm7, xmm11
1266 movdqa xmmword ptr [rsp+0x100], xmm8
1267 movdqa xmm8, xmm4
1268 psrld xmm8, 12
1269 pslld xmm4, 20
1270 por xmm4, xmm8
1271 movdqa xmm8, xmm5
1272 psrld xmm8, 12
1273 pslld xmm5, 20
1274 por xmm5, xmm8
1275 movdqa xmm8, xmm6
1276 psrld xmm8, 12
1277 pslld xmm6, 20
1278 por xmm6, xmm8
1279 movdqa xmm8, xmm7
1280 psrld xmm8, 12
1281 pslld xmm7, 20
1282 por xmm7, xmm8
1283 paddd xmm0, xmmword ptr [rsp+0xF0]
1284 paddd xmm1, xmmword ptr [rsp]
1285 paddd xmm2, xmmword ptr [rsp+0x90]
1286 paddd xmm3, xmmword ptr [rsp+0x60]
1287 paddd xmm0, xmm4
1288 paddd xmm1, xmm5
1289 paddd xmm2, xmm6
1290 paddd xmm3, xmm7
1291 pxor xmm12, xmm0
1292 pxor xmm13, xmm1
1293 pxor xmm14, xmm2
1294 pxor xmm15, xmm3
1295 movdqa xmm8, xmmword ptr [ROT8+rip]
1296 pshufb xmm12, xmm8
1297 pshufb xmm13, xmm8
1298 pshufb xmm14, xmm8
1299 pshufb xmm15, xmm8
1300 movdqa xmm8, xmmword ptr [rsp+0x100]
1301 paddd xmm8, xmm12
1302 paddd xmm9, xmm13
1303 paddd xmm10, xmm14
1304 paddd xmm11, xmm15
1305 pxor xmm4, xmm8
1306 pxor xmm5, xmm9
1307 pxor xmm6, xmm10
1308 pxor xmm7, xmm11
1309 movdqa xmmword ptr [rsp+0x100], xmm8
1310 movdqa xmm8, xmm4
1311 psrld xmm8, 7
1312 pslld xmm4, 25
1313 por xmm4, xmm8
1314 movdqa xmm8, xmm5
1315 psrld xmm8, 7
1316 pslld xmm5, 25
1317 por xmm5, xmm8
1318 movdqa xmm8, xmm6
1319 psrld xmm8, 7
1320 pslld xmm6, 25
1321 por xmm6, xmm8
1322 movdqa xmm8, xmm7
1323 psrld xmm8, 7
1324 pslld xmm7, 25
1325 por xmm7, xmm8
1326 paddd xmm0, xmmword ptr [rsp+0xE0]
1327 paddd xmm1, xmmword ptr [rsp+0x20]
1328 paddd xmm2, xmmword ptr [rsp+0x30]
1329 paddd xmm3, xmmword ptr [rsp+0x70]
1330 paddd xmm0, xmm5
1331 paddd xmm1, xmm6
1332 paddd xmm2, xmm7
1333 paddd xmm3, xmm4
1334 pxor xmm15, xmm0
1335 pxor xmm12, xmm1
1336 pxor xmm13, xmm2
1337 pxor xmm14, xmm3
1338 movdqa xmm8, xmmword ptr [ROT16+rip]
1339 pshufb xmm15, xmm8
1340 pshufb xmm12, xmm8
1341 pshufb xmm13, xmm8
1342 pshufb xmm14, xmm8
1343 paddd xmm10, xmm15
1344 paddd xmm11, xmm12
1345 movdqa xmm8, xmmword ptr [rsp+0x100]
1346 paddd xmm8, xmm13
1347 paddd xmm9, xmm14
1348 pxor xmm5, xmm10
1349 pxor xmm6, xmm11
1350 pxor xmm7, xmm8
1351 pxor xmm4, xmm9
1352 movdqa xmmword ptr [rsp+0x100], xmm8
1353 movdqa xmm8, xmm5
1354 psrld xmm8, 12
1355 pslld xmm5, 20
1356 por xmm5, xmm8
1357 movdqa xmm8, xmm6
1358 psrld xmm8, 12
1359 pslld xmm6, 20
1360 por xmm6, xmm8
1361 movdqa xmm8, xmm7
1362 psrld xmm8, 12
1363 pslld xmm7, 20
1364 por xmm7, xmm8
1365 movdqa xmm8, xmm4
1366 psrld xmm8, 12
1367 pslld xmm4, 20
1368 por xmm4, xmm8
1369 paddd xmm0, xmmword ptr [rsp+0xA0]
1370 paddd xmm1, xmmword ptr [rsp+0xC0]
1371 paddd xmm2, xmmword ptr [rsp+0x40]
1372 paddd xmm3, xmmword ptr [rsp+0xD0]
1373 paddd xmm0, xmm5
1374 paddd xmm1, xmm6
1375 paddd xmm2, xmm7
1376 paddd xmm3, xmm4
1377 pxor xmm15, xmm0
1378 pxor xmm12, xmm1
1379 pxor xmm13, xmm2
1380 pxor xmm14, xmm3
1381 movdqa xmm8, xmmword ptr [ROT8+rip]
1382 pshufb xmm15, xmm8
1383 pshufb xmm12, xmm8
1384 pshufb xmm13, xmm8
1385 pshufb xmm14, xmm8
1386 paddd xmm10, xmm15
1387 paddd xmm11, xmm12
1388 movdqa xmm8, xmmword ptr [rsp+0x100]
1389 paddd xmm8, xmm13
1390 paddd xmm9, xmm14
1391 pxor xmm5, xmm10
1392 pxor xmm6, xmm11
1393 pxor xmm7, xmm8
1394 pxor xmm4, xmm9
1395 pxor xmm0, xmm8
1396 pxor xmm1, xmm9
1397 pxor xmm2, xmm10
1398 pxor xmm3, xmm11
1399 movdqa xmm8, xmm5
1400 psrld xmm8, 7
1401 pslld xmm5, 25
1402 por xmm5, xmm8
1403 movdqa xmm8, xmm6
1404 psrld xmm8, 7
1405 pslld xmm6, 25
1406 por xmm6, xmm8
1407 movdqa xmm8, xmm7
1408 psrld xmm8, 7
1409 pslld xmm7, 25
1410 por xmm7, xmm8
1411 movdqa xmm8, xmm4
1412 psrld xmm8, 7
1413 pslld xmm4, 25
1414 por xmm4, xmm8
1415 pxor xmm4, xmm12
1416 pxor xmm5, xmm13
1417 pxor xmm6, xmm14
1418 pxor xmm7, xmm15
1419 mov eax, r13d
1420 jne 9b
1421 movdqa xmm9, xmm0
1422 punpckldq xmm0, xmm1
1423 punpckhdq xmm9, xmm1
1424 movdqa xmm11, xmm2
1425 punpckldq xmm2, xmm3
1426 punpckhdq xmm11, xmm3
1427 movdqa xmm1, xmm0
1428 punpcklqdq xmm0, xmm2
1429 punpckhqdq xmm1, xmm2
1430 movdqa xmm3, xmm9
1431 punpcklqdq xmm9, xmm11
1432 punpckhqdq xmm3, xmm11
1433 movdqu xmmword ptr [rbx], xmm0
1434 movdqu xmmword ptr [rbx+0x20], xmm1
1435 movdqu xmmword ptr [rbx+0x40], xmm9
1436 movdqu xmmword ptr [rbx+0x60], xmm3
1437 movdqa xmm9, xmm4
1438 punpckldq xmm4, xmm5
1439 punpckhdq xmm9, xmm5
1440 movdqa xmm11, xmm6
1441 punpckldq xmm6, xmm7
1442 punpckhdq xmm11, xmm7
1443 movdqa xmm5, xmm4
1444 punpcklqdq xmm4, xmm6
1445 punpckhqdq xmm5, xmm6
1446 movdqa xmm7, xmm9
1447 punpcklqdq xmm9, xmm11
1448 punpckhqdq xmm7, xmm11
1449 movdqu xmmword ptr [rbx+0x10], xmm4
1450 movdqu xmmword ptr [rbx+0x30], xmm5
1451 movdqu xmmword ptr [rbx+0x50], xmm9
1452 movdqu xmmword ptr [rbx+0x70], xmm7
1453 movdqa xmm1, xmmword ptr [rsp+0x110]
1454 movdqa xmm0, xmm1
1455 paddd xmm1, xmmword ptr [rsp+0x150]
1456 movdqa xmmword ptr [rsp+0x110], xmm1
1457 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1458 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1459 pcmpgtd xmm0, xmm1
1460 movdqa xmm1, xmmword ptr [rsp+0x120]
1461 psubd xmm1, xmm0
1462 movdqa xmmword ptr [rsp+0x120], xmm1
1463 add rbx, 128
1464 add rdi, 32
1465 sub rsi, 4
1466 cmp rsi, 4
1467 jnc 2b
1468 test rsi, rsi
1469 jnz 3f
1470 4:
1471 mov rsp, rbp
1472 pop rbp
1473 pop rbx
1474 pop r12
1475 pop r13
1476 pop r14
1477 pop r15
1478 ret
1479 .p2align 5
1480 3:
1481 test esi, 0x2
1482 je 3f
1483 movups xmm0, xmmword ptr [rcx]
1484 movups xmm1, xmmword ptr [rcx+0x10]
1485 movaps xmm8, xmm0
1486 movaps xmm9, xmm1
1487 movd xmm13, dword ptr [rsp+0x110]
1488 pinsrd xmm13, dword ptr [rsp+0x120], 1
1489 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1490 movaps xmmword ptr [rsp], xmm13
1491 movd xmm14, dword ptr [rsp+0x114]
1492 pinsrd xmm14, dword ptr [rsp+0x124], 1
1493 pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1494 movaps xmmword ptr [rsp+0x10], xmm14
1495 mov r8, qword ptr [rdi]
1496 mov r9, qword ptr [rdi+0x8]
1497 movzx eax, byte ptr [rbp+0x40]
1498 or eax, r13d
1499 xor edx, edx
1500 2:
1501 mov r14d, eax
1502 or eax, r12d
1503 add rdx, 64
1504 cmp rdx, r15
1505 cmovne eax, r14d
1506 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1507 movaps xmm10, xmm2
1508 movups xmm4, xmmword ptr [r8+rdx-0x40]
1509 movups xmm5, xmmword ptr [r8+rdx-0x30]
1510 movaps xmm3, xmm4
1511 shufps xmm4, xmm5, 136
1512 shufps xmm3, xmm5, 221
1513 movaps xmm5, xmm3
1514 movups xmm6, xmmword ptr [r8+rdx-0x20]
1515 movups xmm7, xmmword ptr [r8+rdx-0x10]
1516 movaps xmm3, xmm6
1517 shufps xmm6, xmm7, 136
1518 pshufd xmm6, xmm6, 0x93
1519 shufps xmm3, xmm7, 221
1520 pshufd xmm7, xmm3, 0x93
1521 movups xmm12, xmmword ptr [r9+rdx-0x40]
1522 movups xmm13, xmmword ptr [r9+rdx-0x30]
1523 movaps xmm11, xmm12
1524 shufps xmm12, xmm13, 136
1525 shufps xmm11, xmm13, 221
1526 movaps xmm13, xmm11
1527 movups xmm14, xmmword ptr [r9+rdx-0x20]
1528 movups xmm15, xmmword ptr [r9+rdx-0x10]
1529 movaps xmm11, xmm14
1530 shufps xmm14, xmm15, 136
1531 pshufd xmm14, xmm14, 0x93
1532 shufps xmm11, xmm15, 221
1533 pshufd xmm15, xmm11, 0x93
1534 movaps xmm3, xmmword ptr [rsp]
1535 movaps xmm11, xmmword ptr [rsp+0x10]
1536 pinsrd xmm3, eax, 3
1537 pinsrd xmm11, eax, 3
1538 mov al, 7
1539 9:
1540 paddd xmm0, xmm4
1541 paddd xmm8, xmm12
1542 movaps xmmword ptr [rsp+0x20], xmm4
1543 movaps xmmword ptr [rsp+0x30], xmm12
1544 paddd xmm0, xmm1
1545 paddd xmm8, xmm9
1546 pxor xmm3, xmm0
1547 pxor xmm11, xmm8
1548 movaps xmm12, xmmword ptr [ROT16+rip]
1549 pshufb xmm3, xmm12
1550 pshufb xmm11, xmm12
1551 paddd xmm2, xmm3
1552 paddd xmm10, xmm11
1553 pxor xmm1, xmm2
1554 pxor xmm9, xmm10
1555 movdqa xmm4, xmm1
1556 pslld xmm1, 20
1557 psrld xmm4, 12
1558 por xmm1, xmm4
1559 movdqa xmm4, xmm9
1560 pslld xmm9, 20
1561 psrld xmm4, 12
1562 por xmm9, xmm4
1563 paddd xmm0, xmm5
1564 paddd xmm8, xmm13
1565 movaps xmmword ptr [rsp+0x40], xmm5
1566 movaps xmmword ptr [rsp+0x50], xmm13
1567 paddd xmm0, xmm1
1568 paddd xmm8, xmm9
1569 pxor xmm3, xmm0
1570 pxor xmm11, xmm8
1571 movaps xmm13, xmmword ptr [ROT8+rip]
1572 pshufb xmm3, xmm13
1573 pshufb xmm11, xmm13
1574 paddd xmm2, xmm3
1575 paddd xmm10, xmm11
1576 pxor xmm1, xmm2
1577 pxor xmm9, xmm10
1578 movdqa xmm4, xmm1
1579 pslld xmm1, 25
1580 psrld xmm4, 7
1581 por xmm1, xmm4
1582 movdqa xmm4, xmm9
1583 pslld xmm9, 25
1584 psrld xmm4, 7
1585 por xmm9, xmm4
1586 pshufd xmm0, xmm0, 0x93
1587 pshufd xmm8, xmm8, 0x93
1588 pshufd xmm3, xmm3, 0x4E
1589 pshufd xmm11, xmm11, 0x4E
1590 pshufd xmm2, xmm2, 0x39
1591 pshufd xmm10, xmm10, 0x39
1592 paddd xmm0, xmm6
1593 paddd xmm8, xmm14
1594 paddd xmm0, xmm1
1595 paddd xmm8, xmm9
1596 pxor xmm3, xmm0
1597 pxor xmm11, xmm8
1598 pshufb xmm3, xmm12
1599 pshufb xmm11, xmm12
1600 paddd xmm2, xmm3
1601 paddd xmm10, xmm11
1602 pxor xmm1, xmm2
1603 pxor xmm9, xmm10
1604 movdqa xmm4, xmm1
1605 pslld xmm1, 20
1606 psrld xmm4, 12
1607 por xmm1, xmm4
1608 movdqa xmm4, xmm9
1609 pslld xmm9, 20
1610 psrld xmm4, 12
1611 por xmm9, xmm4
1612 paddd xmm0, xmm7
1613 paddd xmm8, xmm15
1614 paddd xmm0, xmm1
1615 paddd xmm8, xmm9
1616 pxor xmm3, xmm0
1617 pxor xmm11, xmm8
1618 pshufb xmm3, xmm13
1619 pshufb xmm11, xmm13
1620 paddd xmm2, xmm3
1621 paddd xmm10, xmm11
1622 pxor xmm1, xmm2
1623 pxor xmm9, xmm10
1624 movdqa xmm4, xmm1
1625 pslld xmm1, 25
1626 psrld xmm4, 7
1627 por xmm1, xmm4
1628 movdqa xmm4, xmm9
1629 pslld xmm9, 25
1630 psrld xmm4, 7
1631 por xmm9, xmm4
1632 pshufd xmm0, xmm0, 0x39
1633 pshufd xmm8, xmm8, 0x39
1634 pshufd xmm3, xmm3, 0x4E
1635 pshufd xmm11, xmm11, 0x4E
1636 pshufd xmm2, xmm2, 0x93
1637 pshufd xmm10, xmm10, 0x93
1638 dec al
1639 je 9f
1640 movdqa xmm12, xmmword ptr [rsp+0x20]
1641 movdqa xmm5, xmmword ptr [rsp+0x40]
1642 pshufd xmm13, xmm12, 0x0F
1643 shufps xmm12, xmm5, 214
1644 pshufd xmm4, xmm12, 0x39
1645 movdqa xmm12, xmm6
1646 shufps xmm12, xmm7, 250
1647 pblendw xmm13, xmm12, 0xCC
1648 movdqa xmm12, xmm7
1649 punpcklqdq xmm12, xmm5
1650 pblendw xmm12, xmm6, 0xC0
1651 pshufd xmm12, xmm12, 0x78
1652 punpckhdq xmm5, xmm7
1653 punpckldq xmm6, xmm5
1654 pshufd xmm7, xmm6, 0x1E
1655 movdqa xmmword ptr [rsp+0x20], xmm13
1656 movdqa xmmword ptr [rsp+0x40], xmm12
1657 movdqa xmm5, xmmword ptr [rsp+0x30]
1658 movdqa xmm13, xmmword ptr [rsp+0x50]
1659 pshufd xmm6, xmm5, 0x0F
1660 shufps xmm5, xmm13, 214
1661 pshufd xmm12, xmm5, 0x39
1662 movdqa xmm5, xmm14
1663 shufps xmm5, xmm15, 250
1664 pblendw xmm6, xmm5, 0xCC
1665 movdqa xmm5, xmm15
1666 punpcklqdq xmm5, xmm13
1667 pblendw xmm5, xmm14, 0xC0
1668 pshufd xmm5, xmm5, 0x78
1669 punpckhdq xmm13, xmm15
1670 punpckldq xmm14, xmm13
1671 pshufd xmm15, xmm14, 0x1E
1672 movdqa xmm13, xmm6
1673 movdqa xmm14, xmm5
1674 movdqa xmm5, xmmword ptr [rsp+0x20]
1675 movdqa xmm6, xmmword ptr [rsp+0x40]
1676 jmp 9b
1677 9:
1678 pxor xmm0, xmm2
1679 pxor xmm1, xmm3
1680 pxor xmm8, xmm10
1681 pxor xmm9, xmm11
1682 mov eax, r13d
1683 cmp rdx, r15
1684 jne 2b
1685 movups xmmword ptr [rbx], xmm0
1686 movups xmmword ptr [rbx+0x10], xmm1
1687 movups xmmword ptr [rbx+0x20], xmm8
1688 movups xmmword ptr [rbx+0x30], xmm9
1689 movdqa xmm0, xmmword ptr [rsp+0x130]
1690 movdqa xmm1, xmmword ptr [rsp+0x110]
1691 movdqa xmm2, xmmword ptr [rsp+0x120]
1692 movdqu xmm3, xmmword ptr [rsp+0x118]
1693 movdqu xmm4, xmmword ptr [rsp+0x128]
1694 blendvps xmm1, xmm3, xmm0
1695 blendvps xmm2, xmm4, xmm0
1696 movdqa xmmword ptr [rsp+0x110], xmm1
1697 movdqa xmmword ptr [rsp+0x120], xmm2
1698 add rdi, 16
1699 add rbx, 64
1700 sub rsi, 2
1701 3:
1702 test esi, 0x1
1703 je 4b
1704 movups xmm0, xmmword ptr [rcx]
1705 movups xmm1, xmmword ptr [rcx+0x10]
1706 movd xmm13, dword ptr [rsp+0x110]
1707 pinsrd xmm13, dword ptr [rsp+0x120], 1
1708 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1709 movaps xmm14, xmmword ptr [ROT8+rip]
1710 movaps xmm15, xmmword ptr [ROT16+rip]
1711 mov r8, qword ptr [rdi]
1712 movzx eax, byte ptr [rbp+0x40]
1713 or eax, r13d
1714 xor edx, edx
1715 2:
1716 mov r14d, eax
1717 or eax, r12d
1718 add rdx, 64
1719 cmp rdx, r15
1720 cmovne eax, r14d
1721 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1722 movaps xmm3, xmm13
1723 pinsrd xmm3, eax, 3
1724 movups xmm4, xmmword ptr [r8+rdx-0x40]
1725 movups xmm5, xmmword ptr [r8+rdx-0x30]
1726 movaps xmm8, xmm4
1727 shufps xmm4, xmm5, 136
1728 shufps xmm8, xmm5, 221
1729 movaps xmm5, xmm8
1730 movups xmm6, xmmword ptr [r8+rdx-0x20]
1731 movups xmm7, xmmword ptr [r8+rdx-0x10]
1732 movaps xmm8, xmm6
1733 shufps xmm6, xmm7, 136
1734 pshufd xmm6, xmm6, 0x93
1735 shufps xmm8, xmm7, 221
1736 pshufd xmm7, xmm8, 0x93
1737 mov al, 7
1738 9:
1739 paddd xmm0, xmm4
1740 paddd xmm0, xmm1
1741 pxor xmm3, xmm0
1742 pshufb xmm3, xmm15
1743 paddd xmm2, xmm3
1744 pxor xmm1, xmm2
1745 movdqa xmm11, xmm1
1746 pslld xmm1, 20
1747 psrld xmm11, 12
1748 por xmm1, xmm11
1749 paddd xmm0, xmm5
1750 paddd xmm0, xmm1
1751 pxor xmm3, xmm0
1752 pshufb xmm3, xmm14
1753 paddd xmm2, xmm3
1754 pxor xmm1, xmm2
1755 movdqa xmm11, xmm1
1756 pslld xmm1, 25
1757 psrld xmm11, 7
1758 por xmm1, xmm11
1759 pshufd xmm0, xmm0, 0x93
1760 pshufd xmm3, xmm3, 0x4E
1761 pshufd xmm2, xmm2, 0x39
1762 paddd xmm0, xmm6
1763 paddd xmm0, xmm1
1764 pxor xmm3, xmm0
1765 pshufb xmm3, xmm15
1766 paddd xmm2, xmm3
1767 pxor xmm1, xmm2
1768 movdqa xmm11, xmm1
1769 pslld xmm1, 20
1770 psrld xmm11, 12
1771 por xmm1, xmm11
1772 paddd xmm0, xmm7
1773 paddd xmm0, xmm1
1774 pxor xmm3, xmm0
1775 pshufb xmm3, xmm14
1776 paddd xmm2, xmm3
1777 pxor xmm1, xmm2
1778 movdqa xmm11, xmm1
1779 pslld xmm1, 25
1780 psrld xmm11, 7
1781 por xmm1, xmm11
1782 pshufd xmm0, xmm0, 0x39
1783 pshufd xmm3, xmm3, 0x4E
1784 pshufd xmm2, xmm2, 0x93
1785 dec al
1786 jz 9f
1787 movdqa xmm8, xmm4
1788 shufps xmm8, xmm5, 214
1789 pshufd xmm9, xmm4, 0x0F
1790 pshufd xmm4, xmm8, 0x39
1791 movdqa xmm8, xmm6
1792 shufps xmm8, xmm7, 250
1793 pblendw xmm9, xmm8, 0xCC
1794 movdqa xmm8, xmm7
1795 punpcklqdq xmm8, xmm5
1796 pblendw xmm8, xmm6, 0xC0
1797 pshufd xmm8, xmm8, 0x78
1798 punpckhdq xmm5, xmm7
1799 punpckldq xmm6, xmm5
1800 pshufd xmm7, xmm6, 0x1E
1801 movdqa xmm5, xmm9
1802 movdqa xmm6, xmm8
1803 jmp 9b
1804 9:
1805 pxor xmm0, xmm2
1806 pxor xmm1, xmm3
1807 mov eax, r13d
1808 cmp rdx, r15
1809 jne 2b
1810 movups xmmword ptr [rbx], xmm0
1811 movups xmmword ptr [rbx+0x10], xmm1
1812 jmp 4b
/*
 * zfs_blake3_compress_in_place_sse41
 *
 * Compresses one 64-byte block and writes the 32-byte result back over
 * the input state.
 *
 * Register arguments, as consumed below:
 *   rdi - 32-byte state; read into xmm0/xmm1 and overwritten on exit
 *   rsi - 64-byte message block (read with unaligned loads)
 *   dl  - byte value that ends up in lane 2 of xmm3
 *   rcx - 64-bit value that ends up in lanes 0-1 of xmm3
 *   r8b - byte value that ends up in lane 3 of xmm3
 * NOTE(review): presumably (cv, block, block_len, counter, flags) as in
 * the C prototype - confirm against the header that declares this symbol.
 *
 * Clobbers: rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15, flags.
 * Does not touch the stack.
 */
.p2align 6
zfs_blake3_compress_in_place_sse41:
	_CET_ENDBR
	movups	xmm0, xmmword ptr [rdi]			/* state row 0 */
	movups	xmm1, xmmword ptr [rdi+0x10]		/* state row 1 */
	movaps	xmm2, xmmword ptr [BLAKE3_IV+rip]	/* state row 2 */
	/*
	 * Only the low byte of rdx and r8 is meaningful.  Zero-extend both
	 * before merging them, the same way zfs_blake3_compress_xof_sse41
	 * does, so stale upper register bits cannot leak into lanes 2/3.
	 * rax is free to use here: al is reloaded with the round count below.
	 */
	movzx	eax, r8b
	movzx	edx, dl
	shl	rax, 32
	add	rdx, rax				/* rdx = r8b:dl as two dwords */
	movq	xmm3, rcx
	movq	xmm4, rdx
	punpcklqdq xmm3, xmm4				/* row 3 = {rcx lo, rcx hi, dl, r8b} */
	/* Load the block and split it into even/odd dwords (xmm4..xmm7). */
	movups	xmm4, xmmword ptr [rsi]
	movups	xmm5, xmmword ptr [rsi+0x10]
	movaps	xmm8, xmm4
	shufps	xmm4, xmm5, 136
	shufps	xmm8, xmm5, 221
	movaps	xmm5, xmm8
	movups	xmm6, xmmword ptr [rsi+0x20]
	movups	xmm7, xmmword ptr [rsi+0x30]
	movaps	xmm8, xmm6
	shufps	xmm6, xmm7, 136
	pshufd	xmm6, xmm6, 0x93
	shufps	xmm8, xmm7, 221
	pshufd	xmm7, xmm8, 0x93
	movaps	xmm14, xmmword ptr [ROT8+rip]		/* pshufb mask: rotate right 8 */
	movaps	xmm15, xmmword ptr [ROT16+rip]		/* pshufb mask: rotate by 16 */
	mov	al, 7					/* round counter */
9:
	/* First half of the round: mix the four columns. */
	paddd	xmm0, xmm4
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	pshufb	xmm3, xmm15
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	movdqa	xmm11, xmm1				/* rotate row 1 right by 12 */
	pslld	xmm1, 20
	psrld	xmm11, 12
	por	xmm1, xmm11
	paddd	xmm0, xmm5
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	pshufb	xmm3, xmm14
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	movdqa	xmm11, xmm1				/* rotate row 1 right by 7 */
	pslld	xmm1, 25
	psrld	xmm11, 7
	por	xmm1, xmm11
	/* Rotate the lanes of rows 0, 3 and 2 so the second half mixes diagonals. */
	pshufd	xmm0, xmm0, 0x93
	pshufd	xmm3, xmm3, 0x4E
	pshufd	xmm2, xmm2, 0x39
	/* Second half of the round. */
	paddd	xmm0, xmm6
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	pshufb	xmm3, xmm15
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	movdqa	xmm11, xmm1
	pslld	xmm1, 20
	psrld	xmm11, 12
	por	xmm1, xmm11
	paddd	xmm0, xmm7
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	pshufb	xmm3, xmm14
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	movdqa	xmm11, xmm1
	pslld	xmm1, 25
	psrld	xmm11, 7
	por	xmm1, xmm11
	/* Undo the lane rotation. */
	pshufd	xmm0, xmm0, 0x39
	pshufd	xmm3, xmm3, 0x4E
	pshufd	xmm2, xmm2, 0x93
	dec	al
	jz	9f					/* all seven rounds done */
	/* Re-order the message words in xmm4..xmm7 for the next round. */
	movdqa	xmm8, xmm4
	shufps	xmm8, xmm5, 214
	pshufd	xmm9, xmm4, 0x0F
	pshufd	xmm4, xmm8, 0x39
	movdqa	xmm8, xmm6
	shufps	xmm8, xmm7, 250
	pblendw	xmm9, xmm8, 0xCC
	movdqa	xmm8, xmm7
	punpcklqdq xmm8, xmm5
	pblendw	xmm8, xmm6, 0xC0
	pshufd	xmm8, xmm8, 0x78
	punpckhdq xmm5, xmm7
	punpckldq xmm6, xmm5
	pshufd	xmm7, xmm6, 0x1E
	movdqa	xmm5, xmm9
	movdqa	xmm6, xmm8
	jmp	9b
9:
	/* Fold rows 2/3 into rows 0/1 and store the result over the input state. */
	pxor	xmm0, xmm2
	pxor	xmm1, xmm3
	movups	xmmword ptr [rdi], xmm0
	movups	xmmword ptr [rdi+0x10], xmm1
	ret
/*
 * zfs_blake3_compress_xof_sse41
 *
 * Runs the same seven-round compression as the in-place variant, but
 * leaves the input state untouched and writes a 64-byte result to a
 * separate buffer.
 *
 * Register arguments, as consumed below:
 *   rdi - 32-byte input state (read only)
 *   rsi - 64-byte message block (unaligned loads)
 *   dl  - byte value, zero-extended into lane 2 of xmm3
 *   rcx - 64-bit value placed in lanes 0-1 of xmm3
 *   r8b - byte value, zero-extended into lane 3 of xmm3
 *   r9  - 64-byte output buffer (unaligned stores)
 * NOTE(review): presumably (cv, block, block_len, counter, flags, out) -
 * confirm against the C prototype.
 *
 * Clobbers: rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15, flags.
 */
.p2align 6
zfs_blake3_compress_xof_sse41:
	_CET_ENDBR

	/* Pack the two byte arguments into one qword: dl low, r8b high. */
	movzx	edx, dl
	movzx	eax, r8b
	shl	rax, 32
	or	rdx, rax		/* bit ranges are disjoint */

	/* Rows 0-2 of the working state. */
	movups	xmm0, xmmword ptr [rdi]
	movups	xmm1, xmmword ptr [rdi+0x10]
	movaps	xmm2, xmmword ptr [BLAKE3_IV+rip]

	/* Row 3 = {rcx lo, rcx hi, dl, r8b}. */
	movq	xmm3, rcx
	movq	xmm4, rdx
	punpcklqdq xmm3, xmm4

	/* pshufb masks used for the two byte-granular rotations. */
	movaps	xmm15, xmmword ptr [ROT16+rip]
	movaps	xmm14, xmmword ptr [ROT8+rip]

	/* First half of the block: even dwords -> xmm4, odd dwords -> xmm5. */
	movups	xmm4, xmmword ptr [rsi]
	movups	xmm5, xmmword ptr [rsi+0x10]
	movaps	xmm8, xmm4
	shufps	xmm4, xmm5, 0x88
	shufps	xmm8, xmm5, 0xDD
	movaps	xmm5, xmm8

	/* Second half: same split, then rotate the lanes by one position. */
	movups	xmm6, xmmword ptr [rsi+0x20]
	movups	xmm7, xmmword ptr [rsi+0x30]
	movaps	xmm8, xmm6
	shufps	xmm6, xmm7, 0x88
	pshufd	xmm6, xmm6, 0x93
	shufps	xmm8, xmm7, 0xDD
	pshufd	xmm7, xmm8, 0x93

	mov	al, 7			/* rounds remaining */
1:
	/* ---- column half-round ---- */
	paddd	xmm0, xmm4
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	pshufb	xmm3, xmm15		/* row 3: rotate by 16 */
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	movdqa	xmm11, xmm1
	psrld	xmm11, 12
	pslld	xmm1, 20
	por	xmm1, xmm11		/* row 1: rotate right by 12 */
	paddd	xmm0, xmm5
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	pshufb	xmm3, xmm14		/* row 3: rotate right by 8 */
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	movdqa	xmm11, xmm1
	psrld	xmm11, 7
	pslld	xmm1, 25
	por	xmm1, xmm11		/* row 1: rotate right by 7 */

	/* Shift rows 0/3/2 by 1/2/3 lanes to line up the diagonals. */
	pshufd	xmm0, xmm0, 0x93
	pshufd	xmm3, xmm3, 0x4E
	pshufd	xmm2, xmm2, 0x39

	/* ---- diagonal half-round ---- */
	paddd	xmm0, xmm6
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	pshufb	xmm3, xmm15
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	movdqa	xmm11, xmm1
	psrld	xmm11, 12
	pslld	xmm1, 20
	por	xmm1, xmm11
	paddd	xmm0, xmm7
	paddd	xmm0, xmm1
	pxor	xmm3, xmm0
	pshufb	xmm3, xmm14
	paddd	xmm2, xmm3
	pxor	xmm1, xmm2
	movdqa	xmm11, xmm1
	psrld	xmm11, 7
	pslld	xmm1, 25
	por	xmm1, xmm11

	/* Shift the rows back into column order. */
	pshufd	xmm0, xmm0, 0x39
	pshufd	xmm3, xmm3, 0x4E
	pshufd	xmm2, xmm2, 0x93

	dec	al
	je	2f			/* no permutation needed after the last round */

	/* Shuffle the message words in xmm4..xmm7 into next-round order. */
	movdqa	xmm8, xmm4
	shufps	xmm8, xmm5, 0xD6
	pshufd	xmm9, xmm4, 0x0F
	pshufd	xmm4, xmm8, 0x39
	movdqa	xmm8, xmm6
	shufps	xmm8, xmm7, 0xFA
	pblendw	xmm9, xmm8, 0xCC
	movdqa	xmm8, xmm7
	punpcklqdq xmm8, xmm5
	pblendw	xmm8, xmm6, 0xC0
	pshufd	xmm8, xmm8, 0x78
	punpckhdq xmm5, xmm7
	punpckldq xmm6, xmm5
	pshufd	xmm7, xmm6, 0x1E
	movdqa	xmm5, xmm9
	movdqa	xmm6, xmm8
	jmp	1b
2:
	/* Low 32 output bytes: rows 0/1 XOR rows 2/3. */
	pxor	xmm0, xmm2
	pxor	xmm1, xmm3
	/* High 32 output bytes: rows 2/3 XOR the original input state. */
	movdqu	xmm4, xmmword ptr [rdi]
	pxor	xmm2, xmm4
	movdqu	xmm5, xmmword ptr [rdi+0x10]
	pxor	xmm3, xmm5
	movups	xmmword ptr [r9], xmm0
	movups	xmmword ptr [r9+0x10], xmm1
	movups	xmmword ptr [r9+0x20], xmm2
	movups	xmmword ptr [r9+0x30], xmm3
	ret
2019
/*
 * ELF symbol sizes.  The three routines are laid out back to back in
 * .text, so each of the first two is measured up to the entry label of
 * the routine that follows it (this includes the alignment padding in
 * between) instead of up to the end of all code, which made the sizes
 * overlap.  Only the last routine is measured up to the current
 * location.  This relies on the definition order: hash_many,
 * compress_in_place, compress_xof.
 */
.size zfs_blake3_hash_many_sse41, zfs_blake3_compress_in_place_sse41 - zfs_blake3_hash_many_sse41
.size zfs_blake3_compress_in_place_sse41, zfs_blake3_compress_xof_sse41 - zfs_blake3_compress_in_place_sse41
.size zfs_blake3_compress_xof_sse41, . - zfs_blake3_compress_xof_sse41
2023
/*
 * Read-only constant tables for the SSE4.1 routines.  Every table below is
 * 16 bytes long and the first one is 64-byte aligned, so each label stays
 * 16-byte aligned as the aligned vector loads (movaps/movdqa) that
 * reference them require.
 */
2024 #ifdef __APPLE__
2025 .static_data
2026 #else
2027 .section .rodata
2028 #endif
2029 .p2align 6
/* Four 32-bit IV words, loaded as one vector into state row 2 (xmm2). */
2030 BLAKE3_IV:
2031 .long 0x6A09E667, 0xBB67AE85
2032 .long 0x3C6EF372, 0xA54FF53A
/* pshufb mask: swaps the 16-bit halves of every dword (rotate by 16). */
2033 ROT16:
2034 .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
/* pshufb mask: rotates every dword right by 8 bits. */
2035 ROT8:
2036 .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
/*
 * NOTE(review): ADD0/ADD1 are not referenced in the part of the file
 * reviewed here; presumably per-lane counter offsets and the 4-lane
 * counter step of the multi-input routine - confirm.
 */
2037 ADD0:
2038 .long 0, 1, 2, 3
2039 ADD1:
2040 .long 4, 4, 4, 4
/*
 * The same four IV words as BLAKE3_IV, each replicated across all four
 * lanes.  NOTE(review): not referenced in the part reviewed here;
 * presumably used by the 4-way path of the multi-input routine - confirm.
 */
2041 BLAKE3_IV_0:
2042 .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
2043 BLAKE3_IV_1:
2044 .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
2045 BLAKE3_IV_2:
2046 .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
2047 BLAKE3_IV_3:
2048 .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/*
 * The value 64 in every lane.  The multi-input routine inserts the first
 * dword into lane 2 of its per-input parameter vector.
 */
2049 BLAKE3_BLOCK_LEN:
2050 .long 64, 64, 64, 64
/*
 * Sign-bit mask.  The multi-input routine XORs it into both operands
 * ahead of pcmpgtd, which makes that signed compare order the values as
 * unsigned.
 */
2051 CMP_MSB_MASK:
2052 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
2053
2054 #endif /* HAVE_SSE4_1 */
2055
2056 #ifdef __ELF__
2057 .section .note.GNU-stack,"",%progbits
2058 #endif