4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
24 * Copyright (c) 2019-2020 Samuel Neves
25 * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
/* The code below is assembled only when HAVE_SSE4_1 is defined. */
28 #if defined(HAVE_SSE4_1)
31 #include <sys/asm_linkage.h>
/*
 * CET probing: only attempted on ELF targets built with __CET__ and a
 * preprocessor that provides __has_include.
 */
33 #if defined(__ELF__) && defined(__CET__) && defined(__has_include)
34 #if __has_include(<cet.h>)
/*
 * NOTE(review): presumably a fallback definition of _CET_ENDBR follows this
 * test; the lines in between are not visible in this listing.
 */
39 #if !defined(_CET_ENDBR)
/* GNU as: Intel operand order, registers written without a % prefix. */
43 .intel_syntax noprefix
/* The three exported entry points of this file. */
44 .global zfs_blake3_compress_in_place_sse41
45 .global zfs_blake3_compress_xof_sse41
46 .global zfs_blake3_hash_many_sse41
/* Mark each exported symbol as a function in the ELF symbol table. */
49 .type zfs_blake3_hash_many_sse41,@function
50 .type zfs_blake3_compress_in_place_sse41,@function
51 .type zfs_blake3_compress_xof_sse41,@function
/*
 * zfs_blake3_hash_many_sse41
 *
 * NOTE(review): the line numbers embedded at the start of each line are not
 * contiguous, so this listing appears to omit instructions (for example the
 * prologue/epilogue, local labels, branches and register copies).  The
 * comments below describe only what the visible instructions do; anything
 * about the omitted parts is marked as an assumption.
 *
 * Registers, as used by the visible code:
 *   rdi   base of an array of input pointers ([rdi], [rdi+0x8], ... are
 *         loaded into r8-r11)
 *   rdx   offset added to every input pointer when reading input data
 *   rcx   points at 32 bytes that are loaded as the starting state
 *   rbx   destination of the result stores, loaded from [rbp+0x50]
 *   rbp   base for reading [rbp+0x38], [rbp+0x40], [rbp+0x48], [rbp+0x50]
 *         (presumably stack-passed arguments; confirm against the prototype)
 *
 * Stack slots, relative to the 64-byte aligned rsp:
 *   [rsp+0x000..0x0F0]  sixteen 16-byte message vectors (four-input path);
 *                       the two-input path reuses [rsp]..[rsp+0x50]
 *   [rsp+0x100]         spill slot for xmm8
 *   [rsp+0x110]         per-lane vector advanced after the result stores
 *   [rsp+0x120]         second per-lane vector rewritten at the same point
 *                       (presumably low/high halves of a counter; confirm)
 *   [rsp+0x130]         mask vector, later the blendvps selector
 *   [rsp+0x150]         per-lane increment added to [rsp+0x110]
 */
54 zfs_blake3_hash_many_sse41:
/* Round rsp down to a 64-byte boundary (clears the low six bits). */
64 and rsp, 0xFFFFFFFFFFFFFFC0
/* Broadcast dword 0 of xmm0 to all lanes and keep it at [rsp+0x130]. */
67 pshufd xmm0, xmm0, 0x00
68 movdqa xmmword ptr [rsp+0x130], xmm0
/* AND xmm1 / xmm0 with the ADD0 / ADD1 tables; the ADD1 result is kept as the increment. */
70 pand xmm1, xmmword ptr [ADD0+rip]
71 pand xmm0, xmmword ptr [ADD1+rip]
72 movdqa xmmword ptr [rsp+0x150], xmm0
/* Broadcast dword 0 of xmm0 again and store the first per-lane vector. */
74 pshufd xmm0, xmm0, 0x00
76 movdqa xmmword ptr [rsp+0x110], xmm0
/*
 * XOR both operands with CMP_MSB_MASK.
 * NOTE(review): presumably this biases the values so that a following signed
 * compare acts as an unsigned one; the compare itself is not visible here.
 */
77 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
78 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
/* Broadcast dword 0 of xmm2 and store the second per-lane vector. */
82 pshufd xmm2, xmm2, 0x00
84 movdqa xmmword ptr [rsp+0x120], xmm2
/* rbx = value at [rbp+0x50]; used below as the output pointer. */
85 mov rbx, qword ptr [rbp+0x50]
/* Zero-extended byte values read through rbp; their use is not visible in this listing. */
88 movzx r13d, byte ptr [rbp+0x38]
89 movzx r12d, byte ptr [rbp+0x48]
/*
 * ---- Four inputs at a time ----
 * Load 32 bytes of state from [rcx] and broadcast each of its eight dwords
 * into its own register: xmm0-xmm3 from the first 16 bytes, xmm4-xmm7 from
 * the second 16 bytes.
 */
93 movdqu xmm3, xmmword ptr [rcx]
94 pshufd xmm0, xmm3, 0x00
95 pshufd xmm1, xmm3, 0x55
96 pshufd xmm2, xmm3, 0xAA
97 pshufd xmm3, xmm3, 0xFF
98 movdqu xmm7, xmmword ptr [rcx+0x10]
99 pshufd xmm4, xmm7, 0x00
100 pshufd xmm5, xmm7, 0x55
101 pshufd xmm6, xmm7, 0xAA
102 pshufd xmm7, xmm7, 0xFF
/* r8-r11 = four consecutive input pointers from the array at rdi. */
103 mov r8, qword ptr [rdi]
104 mov r9, qword ptr [rdi+0x8]
105 mov r10, qword ptr [rdi+0x10]
106 mov r11, qword ptr [rdi+0x18]
/* eax = byte at [rbp+0x40], zero-extended. */
107 movzx eax, byte ptr [rbp+0x40]
/*
 * Bytes 0-15 of the 64-byte window that ends at offset rdx: read 16 bytes
 * from each of the four inputs, interleave them with punpck{l,h}dq and
 * punpck{l,h}qdq, and store the four results at [rsp]..[rsp+0x30].
 * NOTE(review): this has the shape of a 4x4 dword transpose, but the
 * register copies it needs are not visible in this listing; confirm.
 */
116 movdqu xmm8, xmmword ptr [r8+rdx-0x40]
117 movdqu xmm9, xmmword ptr [r9+rdx-0x40]
118 movdqu xmm10, xmmword ptr [r10+rdx-0x40]
119 movdqu xmm11, xmmword ptr [r11+rdx-0x40]
122 punpckhdq xmm12, xmm9
124 punpckldq xmm10, xmm11
125 punpckhdq xmm14, xmm11
127 punpcklqdq xmm8, xmm10
128 punpckhqdq xmm9, xmm10
130 punpcklqdq xmm12, xmm14
131 punpckhqdq xmm13, xmm14
132 movdqa xmmword ptr [rsp], xmm8
133 movdqa xmmword ptr [rsp+0x10], xmm9
134 movdqa xmmword ptr [rsp+0x20], xmm12
135 movdqa xmmword ptr [rsp+0x30], xmm13
/* Same interleave for bytes 16-31 of the window; results go to [rsp+0x40]..[rsp+0x70]. */
136 movdqu xmm8, xmmword ptr [r8+rdx-0x30]
137 movdqu xmm9, xmmword ptr [r9+rdx-0x30]
138 movdqu xmm10, xmmword ptr [r10+rdx-0x30]
139 movdqu xmm11, xmmword ptr [r11+rdx-0x30]
142 punpckhdq xmm12, xmm9
144 punpckldq xmm10, xmm11
145 punpckhdq xmm14, xmm11
147 punpcklqdq xmm8, xmm10
148 punpckhqdq xmm9, xmm10
150 punpcklqdq xmm12, xmm14
151 punpckhqdq xmm13, xmm14
152 movdqa xmmword ptr [rsp+0x40], xmm8
153 movdqa xmmword ptr [rsp+0x50], xmm9
154 movdqa xmmword ptr [rsp+0x60], xmm12
155 movdqa xmmword ptr [rsp+0x70], xmm13
/* Same interleave for bytes 32-47 of the window; results go to [rsp+0x80]..[rsp+0xB0]. */
156 movdqu xmm8, xmmword ptr [r8+rdx-0x20]
157 movdqu xmm9, xmmword ptr [r9+rdx-0x20]
158 movdqu xmm10, xmmword ptr [r10+rdx-0x20]
159 movdqu xmm11, xmmword ptr [r11+rdx-0x20]
162 punpckhdq xmm12, xmm9
164 punpckldq xmm10, xmm11
165 punpckhdq xmm14, xmm11
167 punpcklqdq xmm8, xmm10
168 punpckhqdq xmm9, xmm10
170 punpcklqdq xmm12, xmm14
171 punpckhqdq xmm13, xmm14
172 movdqa xmmword ptr [rsp+0x80], xmm8
173 movdqa xmmword ptr [rsp+0x90], xmm9
174 movdqa xmmword ptr [rsp+0xA0], xmm12
175 movdqa xmmword ptr [rsp+0xB0], xmm13
/* Same interleave for bytes 48-63 of the window; results go to [rsp+0xC0]..[rsp+0xF0]. */
176 movdqu xmm8, xmmword ptr [r8+rdx-0x10]
177 movdqu xmm9, xmmword ptr [r9+rdx-0x10]
178 movdqu xmm10, xmmword ptr [r10+rdx-0x10]
179 movdqu xmm11, xmmword ptr [r11+rdx-0x10]
182 punpckhdq xmm12, xmm9
184 punpckldq xmm10, xmm11
185 punpckhdq xmm14, xmm11
187 punpcklqdq xmm8, xmm10
188 punpckhqdq xmm9, xmm10
190 punpcklqdq xmm12, xmm14
191 punpckhqdq xmm13, xmm14
192 movdqa xmmword ptr [rsp+0xC0], xmm8
193 movdqa xmmword ptr [rsp+0xD0], xmm9
194 movdqa xmmword ptr [rsp+0xE0], xmm12
195 movdqa xmmword ptr [rsp+0xF0], xmm13
/*
 * Fill xmm9-xmm14 from the BLAKE3_IV_1..3 tables, the two per-lane vectors
 * at [rsp+0x110] / [rsp+0x120] and BLAKE3_BLOCK_LEN, then broadcast dword 0
 * of xmm15 to all lanes.
 */
196 movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
197 movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
198 movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
199 movdqa xmm12, xmmword ptr [rsp+0x110]
200 movdqa xmm13, xmmword ptr [rsp+0x120]
201 movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
203 pshufd xmm15, xmm15, 0x00
/* Prefetch each input 0x80 bytes past the current offset. */
204 prefetcht0 [r8+rdx+0x80]
205 prefetcht0 [r9+rdx+0x80]
206 prefetcht0 [r10+rdx+0x80]
207 prefetcht0 [r11+rdx+0x80]
/*
 * Mixing section.  Every visible group has the same shape: add four stored
 * message vectors to xmm0-xmm3, load the ROT16 or ROT8 table into xmm8, then
 * reload xmm8 from the spill slot [rsp+0x100] and later store it back.  The
 * arithmetic between those lines is not visible in this listing.
 * NOTE(review): four groups presumably form one round, seven rounds in all;
 * the "round" markers below rest on that assumption.  Only the order of the
 * message slots changes from round to round.
 */
/* Round 1 (presumed).  The first group takes xmm8 from BLAKE3_IV_0 instead of the spill slot. */
208 paddd xmm0, xmmword ptr [rsp]
209 paddd xmm1, xmmword ptr [rsp+0x20]
210 paddd xmm2, xmmword ptr [rsp+0x40]
211 paddd xmm3, xmmword ptr [rsp+0x60]
220 movdqa xmm8, xmmword ptr [ROT16+rip]
225 movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
234 movdqa xmmword ptr [rsp+0x100], xmm8
251 paddd xmm0, xmmword ptr [rsp+0x10]
252 paddd xmm1, xmmword ptr [rsp+0x30]
253 paddd xmm2, xmmword ptr [rsp+0x50]
254 paddd xmm3, xmmword ptr [rsp+0x70]
263 movdqa xmm8, xmmword ptr [ROT8+rip]
268 movdqa xmm8, xmmword ptr [rsp+0x100]
277 movdqa xmmword ptr [rsp+0x100], xmm8
294 paddd xmm0, xmmword ptr [rsp+0x80]
295 paddd xmm1, xmmword ptr [rsp+0xA0]
296 paddd xmm2, xmmword ptr [rsp+0xC0]
297 paddd xmm3, xmmword ptr [rsp+0xE0]
306 movdqa xmm8, xmmword ptr [ROT16+rip]
313 movdqa xmm8, xmmword ptr [rsp+0x100]
320 movdqa xmmword ptr [rsp+0x100], xmm8
337 paddd xmm0, xmmword ptr [rsp+0x90]
338 paddd xmm1, xmmword ptr [rsp+0xB0]
339 paddd xmm2, xmmword ptr [rsp+0xD0]
340 paddd xmm3, xmmword ptr [rsp+0xF0]
349 movdqa xmm8, xmmword ptr [ROT8+rip]
356 movdqa xmm8, xmmword ptr [rsp+0x100]
363 movdqa xmmword ptr [rsp+0x100], xmm8
/* Round 2 (presumed). */
380 paddd xmm0, xmmword ptr [rsp+0x20]
381 paddd xmm1, xmmword ptr [rsp+0x30]
382 paddd xmm2, xmmword ptr [rsp+0x70]
383 paddd xmm3, xmmword ptr [rsp+0x40]
392 movdqa xmm8, xmmword ptr [ROT16+rip]
397 movdqa xmm8, xmmword ptr [rsp+0x100]
406 movdqa xmmword ptr [rsp+0x100], xmm8
423 paddd xmm0, xmmword ptr [rsp+0x60]
424 paddd xmm1, xmmword ptr [rsp+0xA0]
425 paddd xmm2, xmmword ptr [rsp]
426 paddd xmm3, xmmword ptr [rsp+0xD0]
435 movdqa xmm8, xmmword ptr [ROT8+rip]
440 movdqa xmm8, xmmword ptr [rsp+0x100]
449 movdqa xmmword ptr [rsp+0x100], xmm8
466 paddd xmm0, xmmword ptr [rsp+0x10]
467 paddd xmm1, xmmword ptr [rsp+0xC0]
468 paddd xmm2, xmmword ptr [rsp+0x90]
469 paddd xmm3, xmmword ptr [rsp+0xF0]
478 movdqa xmm8, xmmword ptr [ROT16+rip]
485 movdqa xmm8, xmmword ptr [rsp+0x100]
492 movdqa xmmword ptr [rsp+0x100], xmm8
509 paddd xmm0, xmmword ptr [rsp+0xB0]
510 paddd xmm1, xmmword ptr [rsp+0x50]
511 paddd xmm2, xmmword ptr [rsp+0xE0]
512 paddd xmm3, xmmword ptr [rsp+0x80]
521 movdqa xmm8, xmmword ptr [ROT8+rip]
528 movdqa xmm8, xmmword ptr [rsp+0x100]
535 movdqa xmmword ptr [rsp+0x100], xmm8
/* Round 3 (presumed). */
552 paddd xmm0, xmmword ptr [rsp+0x30]
553 paddd xmm1, xmmword ptr [rsp+0xA0]
554 paddd xmm2, xmmword ptr [rsp+0xD0]
555 paddd xmm3, xmmword ptr [rsp+0x70]
564 movdqa xmm8, xmmword ptr [ROT16+rip]
569 movdqa xmm8, xmmword ptr [rsp+0x100]
578 movdqa xmmword ptr [rsp+0x100], xmm8
595 paddd xmm0, xmmword ptr [rsp+0x40]
596 paddd xmm1, xmmword ptr [rsp+0xC0]
597 paddd xmm2, xmmword ptr [rsp+0x20]
598 paddd xmm3, xmmword ptr [rsp+0xE0]
607 movdqa xmm8, xmmword ptr [ROT8+rip]
612 movdqa xmm8, xmmword ptr [rsp+0x100]
621 movdqa xmmword ptr [rsp+0x100], xmm8
638 paddd xmm0, xmmword ptr [rsp+0x60]
639 paddd xmm1, xmmword ptr [rsp+0x90]
640 paddd xmm2, xmmword ptr [rsp+0xB0]
641 paddd xmm3, xmmword ptr [rsp+0x80]
650 movdqa xmm8, xmmword ptr [ROT16+rip]
657 movdqa xmm8, xmmword ptr [rsp+0x100]
664 movdqa xmmword ptr [rsp+0x100], xmm8
681 paddd xmm0, xmmword ptr [rsp+0x50]
682 paddd xmm1, xmmword ptr [rsp]
683 paddd xmm2, xmmword ptr [rsp+0xF0]
684 paddd xmm3, xmmword ptr [rsp+0x10]
693 movdqa xmm8, xmmword ptr [ROT8+rip]
700 movdqa xmm8, xmmword ptr [rsp+0x100]
707 movdqa xmmword ptr [rsp+0x100], xmm8
/* Round 4 (presumed). */
724 paddd xmm0, xmmword ptr [rsp+0xA0]
725 paddd xmm1, xmmword ptr [rsp+0xC0]
726 paddd xmm2, xmmword ptr [rsp+0xE0]
727 paddd xmm3, xmmword ptr [rsp+0xD0]
736 movdqa xmm8, xmmword ptr [ROT16+rip]
741 movdqa xmm8, xmmword ptr [rsp+0x100]
750 movdqa xmmword ptr [rsp+0x100], xmm8
767 paddd xmm0, xmmword ptr [rsp+0x70]
768 paddd xmm1, xmmword ptr [rsp+0x90]
769 paddd xmm2, xmmword ptr [rsp+0x30]
770 paddd xmm3, xmmword ptr [rsp+0xF0]
779 movdqa xmm8, xmmword ptr [ROT8+rip]
784 movdqa xmm8, xmmword ptr [rsp+0x100]
793 movdqa xmmword ptr [rsp+0x100], xmm8
810 paddd xmm0, xmmword ptr [rsp+0x40]
811 paddd xmm1, xmmword ptr [rsp+0xB0]
812 paddd xmm2, xmmword ptr [rsp+0x50]
813 paddd xmm3, xmmword ptr [rsp+0x10]
822 movdqa xmm8, xmmword ptr [ROT16+rip]
829 movdqa xmm8, xmmword ptr [rsp+0x100]
836 movdqa xmmword ptr [rsp+0x100], xmm8
853 paddd xmm0, xmmword ptr [rsp]
854 paddd xmm1, xmmword ptr [rsp+0x20]
855 paddd xmm2, xmmword ptr [rsp+0x80]
856 paddd xmm3, xmmword ptr [rsp+0x60]
865 movdqa xmm8, xmmword ptr [ROT8+rip]
872 movdqa xmm8, xmmword ptr [rsp+0x100]
879 movdqa xmmword ptr [rsp+0x100], xmm8
/* Round 5 (presumed). */
896 paddd xmm0, xmmword ptr [rsp+0xC0]
897 paddd xmm1, xmmword ptr [rsp+0x90]
898 paddd xmm2, xmmword ptr [rsp+0xF0]
899 paddd xmm3, xmmword ptr [rsp+0xE0]
908 movdqa xmm8, xmmword ptr [ROT16+rip]
913 movdqa xmm8, xmmword ptr [rsp+0x100]
922 movdqa xmmword ptr [rsp+0x100], xmm8
939 paddd xmm0, xmmword ptr [rsp+0xD0]
940 paddd xmm1, xmmword ptr [rsp+0xB0]
941 paddd xmm2, xmmword ptr [rsp+0xA0]
942 paddd xmm3, xmmword ptr [rsp+0x80]
951 movdqa xmm8, xmmword ptr [ROT8+rip]
956 movdqa xmm8, xmmword ptr [rsp+0x100]
965 movdqa xmmword ptr [rsp+0x100], xmm8
982 paddd xmm0, xmmword ptr [rsp+0x70]
983 paddd xmm1, xmmword ptr [rsp+0x50]
984 paddd xmm2, xmmword ptr [rsp]
985 paddd xmm3, xmmword ptr [rsp+0x60]
994 movdqa xmm8, xmmword ptr [ROT16+rip]
1001 movdqa xmm8, xmmword ptr [rsp+0x100]
1008 movdqa xmmword ptr [rsp+0x100], xmm8
1025 paddd xmm0, xmmword ptr [rsp+0x20]
1026 paddd xmm1, xmmword ptr [rsp+0x30]
1027 paddd xmm2, xmmword ptr [rsp+0x10]
1028 paddd xmm3, xmmword ptr [rsp+0x40]
1037 movdqa xmm8, xmmword ptr [ROT8+rip]
1044 movdqa xmm8, xmmword ptr [rsp+0x100]
1051 movdqa xmmword ptr [rsp+0x100], xmm8
/* Round 6 (presumed). */
1068 paddd xmm0, xmmword ptr [rsp+0x90]
1069 paddd xmm1, xmmword ptr [rsp+0xB0]
1070 paddd xmm2, xmmword ptr [rsp+0x80]
1071 paddd xmm3, xmmword ptr [rsp+0xF0]
1080 movdqa xmm8, xmmword ptr [ROT16+rip]
1085 movdqa xmm8, xmmword ptr [rsp+0x100]
1094 movdqa xmmword ptr [rsp+0x100], xmm8
1111 paddd xmm0, xmmword ptr [rsp+0xE0]
1112 paddd xmm1, xmmword ptr [rsp+0x50]
1113 paddd xmm2, xmmword ptr [rsp+0xC0]
1114 paddd xmm3, xmmword ptr [rsp+0x10]
1123 movdqa xmm8, xmmword ptr [ROT8+rip]
1128 movdqa xmm8, xmmword ptr [rsp+0x100]
1137 movdqa xmmword ptr [rsp+0x100], xmm8
1154 paddd xmm0, xmmword ptr [rsp+0xD0]
1155 paddd xmm1, xmmword ptr [rsp]
1156 paddd xmm2, xmmword ptr [rsp+0x20]
1157 paddd xmm3, xmmword ptr [rsp+0x40]
1166 movdqa xmm8, xmmword ptr [ROT16+rip]
1173 movdqa xmm8, xmmword ptr [rsp+0x100]
1180 movdqa xmmword ptr [rsp+0x100], xmm8
1197 paddd xmm0, xmmword ptr [rsp+0x30]
1198 paddd xmm1, xmmword ptr [rsp+0xA0]
1199 paddd xmm2, xmmword ptr [rsp+0x60]
1200 paddd xmm3, xmmword ptr [rsp+0x70]
1209 movdqa xmm8, xmmword ptr [ROT8+rip]
1216 movdqa xmm8, xmmword ptr [rsp+0x100]
1223 movdqa xmmword ptr [rsp+0x100], xmm8
/* Round 7 (presumed).  The last group reloads xmm8 but has no visible store back to the spill slot. */
1240 paddd xmm0, xmmword ptr [rsp+0xB0]
1241 paddd xmm1, xmmword ptr [rsp+0x50]
1242 paddd xmm2, xmmword ptr [rsp+0x10]
1243 paddd xmm3, xmmword ptr [rsp+0x80]
1252 movdqa xmm8, xmmword ptr [ROT16+rip]
1257 movdqa xmm8, xmmword ptr [rsp+0x100]
1266 movdqa xmmword ptr [rsp+0x100], xmm8
1283 paddd xmm0, xmmword ptr [rsp+0xF0]
1284 paddd xmm1, xmmword ptr [rsp]
1285 paddd xmm2, xmmword ptr [rsp+0x90]
1286 paddd xmm3, xmmword ptr [rsp+0x60]
1295 movdqa xmm8, xmmword ptr [ROT8+rip]
1300 movdqa xmm8, xmmword ptr [rsp+0x100]
1309 movdqa xmmword ptr [rsp+0x100], xmm8
1326 paddd xmm0, xmmword ptr [rsp+0xE0]
1327 paddd xmm1, xmmword ptr [rsp+0x20]
1328 paddd xmm2, xmmword ptr [rsp+0x30]
1329 paddd xmm3, xmmword ptr [rsp+0x70]
1338 movdqa xmm8, xmmword ptr [ROT16+rip]
1345 movdqa xmm8, xmmword ptr [rsp+0x100]
1352 movdqa xmmword ptr [rsp+0x100], xmm8
1369 paddd xmm0, xmmword ptr [rsp+0xA0]
1370 paddd xmm1, xmmword ptr [rsp+0xC0]
1371 paddd xmm2, xmmword ptr [rsp+0x40]
1372 paddd xmm3, xmmword ptr [rsp+0xD0]
1381 movdqa xmm8, xmmword ptr [ROT8+rip]
1388 movdqa xmm8, xmmword ptr [rsp+0x100]
/*
 * Interleave xmm0-xmm3 and store the four results at [rbx], [rbx+0x20],
 * [rbx+0x40] and [rbx+0x60].
 * NOTE(review): presumably this transposes the lane-per-input layout back so
 * that each input's 32 output bytes are contiguous; confirm.
 */
1422 punpckldq xmm0, xmm1
1423 punpckhdq xmm9, xmm1
1425 punpckldq xmm2, xmm3
1426 punpckhdq xmm11, xmm3
1428 punpcklqdq xmm0, xmm2
1429 punpckhqdq xmm1, xmm2
1431 punpcklqdq xmm9, xmm11
1432 punpckhqdq xmm3, xmm11
1433 movdqu xmmword ptr [rbx], xmm0
1434 movdqu xmmword ptr [rbx+0x20], xmm1
1435 movdqu xmmword ptr [rbx+0x40], xmm9
1436 movdqu xmmword ptr [rbx+0x60], xmm3
/* Same interleave for xmm4-xmm7; results fill the odd 16-byte slots [rbx+0x10] .. [rbx+0x70]. */
1438 punpckldq xmm4, xmm5
1439 punpckhdq xmm9, xmm5
1441 punpckldq xmm6, xmm7
1442 punpckhdq xmm11, xmm7
1444 punpcklqdq xmm4, xmm6
1445 punpckhqdq xmm5, xmm6
1447 punpcklqdq xmm9, xmm11
1448 punpckhqdq xmm7, xmm11
1449 movdqu xmmword ptr [rbx+0x10], xmm4
1450 movdqu xmmword ptr [rbx+0x30], xmm5
1451 movdqu xmmword ptr [rbx+0x50], xmm9
1452 movdqu xmmword ptr [rbx+0x70], xmm7
/*
 * Advance the per-lane vector at [rsp+0x110] by the increment at
 * [rsp+0x150], then rewrite [rsp+0x120].
 * NOTE(review): the XORs with CMP_MSB_MASK look like preparation for a
 * carry test into [rsp+0x120]; the compare and the subtract or add are not
 * visible here.
 */
1453 movdqa xmm1, xmmword ptr [rsp+0x110]
1455 paddd xmm1, xmmword ptr [rsp+0x150]
1456 movdqa xmmword ptr [rsp+0x110], xmm1
1457 pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1458 pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1460 movdqa xmm1, xmmword ptr [rsp+0x120]
1462 movdqa xmmword ptr [rsp+0x120], xmm1
/*
 * ---- Two inputs at a time ----
 * Load 32 bytes of state from [rcx] into xmm0 / xmm1.
 */
1483 movups xmm0, xmmword ptr [rcx]
1484 movups xmm1, xmmword ptr [rcx+0x10]
/*
 * Build one 16-byte vector per input: dword 0 from lane N of [rsp+0x110],
 * dword 1 from lane N of [rsp+0x120], dword 2 from the first dword of
 * BLAKE3_BLOCK_LEN.  Lane 0 goes to [rsp], lane 1 to [rsp+0x10].
 */
1487 movd xmm13, dword ptr [rsp+0x110]
1488 pinsrd xmm13, dword ptr [rsp+0x120], 1
1489 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1490 movaps xmmword ptr [rsp], xmm13
1491 movd xmm14, dword ptr [rsp+0x114]
1492 pinsrd xmm14, dword ptr [rsp+0x124], 1
1493 pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1494 movaps xmmword ptr [rsp+0x10], xmm14
/* r8 / r9 = the next two input pointers; eax = byte at [rbp+0x40]. */
1495 mov r8, qword ptr [rdi]
1496 mov r9, qword ptr [rdi+0x8]
1497 movzx eax, byte ptr [rbp+0x40]
/* xmm2 = the BLAKE3_IV table. */
1506 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
/*
 * First input: read the 64-byte window ending at offset rdx and split it
 * with shufps (136 selects even dwords, 221 selects odd dwords); pshufd 0x93
 * rotates the result by one dword.  The message ends up in xmm4-xmm7.
 */
1508 movups xmm4, xmmword ptr [r8+rdx-0x40]
1509 movups xmm5, xmmword ptr [r8+rdx-0x30]
1511 shufps xmm4, xmm5, 136
1512 shufps xmm3, xmm5, 221
1514 movups xmm6, xmmword ptr [r8+rdx-0x20]
1515 movups xmm7, xmmword ptr [r8+rdx-0x10]
1517 shufps xmm6, xmm7, 136
1518 pshufd xmm6, xmm6, 0x93
1519 shufps xmm3, xmm7, 221
1520 pshufd xmm7, xmm3, 0x93
/* Second input: same split, message ends up in xmm12-xmm15. */
1521 movups xmm12, xmmword ptr [r9+rdx-0x40]
1522 movups xmm13, xmmword ptr [r9+rdx-0x30]
1524 shufps xmm12, xmm13, 136
1525 shufps xmm11, xmm13, 221
1527 movups xmm14, xmmword ptr [r9+rdx-0x20]
1528 movups xmm15, xmmword ptr [r9+rdx-0x10]
1530 shufps xmm14, xmm15, 136
1531 pshufd xmm14, xmm14, 0x93
1532 shufps xmm11, xmm15, 221
1533 pshufd xmm15, xmm11, 0x93
/* Reload the two vectors built above; eax is inserted as dword 3 of xmm11. */
1534 movaps xmm3, xmmword ptr [rsp]
1535 movaps xmm11, xmmword ptr [rsp+0x10]
1537 pinsrd xmm11, eax, 3
/* Park xmm4 / xmm12 on the stack so that xmm12 can hold the ROT16 table. */
1542 movaps xmmword ptr [rsp+0x20], xmm4
1543 movaps xmmword ptr [rsp+0x30], xmm12
1548 movaps xmm12, xmmword ptr [ROT16+rip]
/* Park xmm5 / xmm13 on the stack so that xmm13 can hold the ROT8 table. */
1565 movaps xmmword ptr [rsp+0x40], xmm5
1566 movaps xmmword ptr [rsp+0x50], xmm13
1571 movaps xmm13, xmmword ptr [ROT8+rip]
/*
 * Rotate the dwords of three state rows of both inputs (0x93, 0x4E, 0x39).
 * NOTE(review): presumably the diagonalization step of the round; confirm.
 */
1586 pshufd xmm0, xmm0, 0x93
1587 pshufd xmm8, xmm8, 0x93
1588 pshufd xmm3, xmm3, 0x4E
1589 pshufd xmm11, xmm11, 0x4E
1590 pshufd xmm2, xmm2, 0x39
1591 pshufd xmm10, xmm10, 0x39
/* Inverse rotation (0x39, 0x4E, 0x93) restores the original row order. */
1632 pshufd xmm0, xmm0, 0x39
1633 pshufd xmm8, xmm8, 0x39
1634 pshufd xmm3, xmm3, 0x4E
1635 pshufd xmm11, xmm11, 0x4E
1636 pshufd xmm2, xmm2, 0x93
1637 pshufd xmm10, xmm10, 0x93
/*
 * Reshuffle the first input's message registers, using the copies parked at
 * [rsp+0x20] and [rsp+0x40]; the new values are written back to those slots.
 * NOTE(review): presumably the message permutation for the next round.
 */
1640 movdqa xmm12, xmmword ptr [rsp+0x20]
1641 movdqa xmm5, xmmword ptr [rsp+0x40]
1642 pshufd xmm13, xmm12, 0x0F
1643 shufps xmm12, xmm5, 214
1644 pshufd xmm4, xmm12, 0x39
1646 shufps xmm12, xmm7, 250
1647 pblendw xmm13, xmm12, 0xCC
1649 punpcklqdq xmm12, xmm5
1650 pblendw xmm12, xmm6, 0xC0
1651 pshufd xmm12, xmm12, 0x78
1652 punpckhdq xmm5, xmm7
1653 punpckldq xmm6, xmm5
1654 pshufd xmm7, xmm6, 0x1E
1655 movdqa xmmword ptr [rsp+0x20], xmm13
1656 movdqa xmmword ptr [rsp+0x40], xmm12
/* Same reshuffle for the second input, using the copies at [rsp+0x30] and [rsp+0x50]. */
1657 movdqa xmm5, xmmword ptr [rsp+0x30]
1658 movdqa xmm13, xmmword ptr [rsp+0x50]
1659 pshufd xmm6, xmm5, 0x0F
1660 shufps xmm5, xmm13, 214
1661 pshufd xmm12, xmm5, 0x39
1663 shufps xmm5, xmm15, 250
1664 pblendw xmm6, xmm5, 0xCC
1666 punpcklqdq xmm5, xmm13
1667 pblendw xmm5, xmm14, 0xC0
1668 pshufd xmm5, xmm5, 0x78
1669 punpckhdq xmm13, xmm15
1670 punpckldq xmm14, xmm13
1671 pshufd xmm15, xmm14, 0x1E
/* Bring the first input's reshuffled values back into xmm5 / xmm6. */
1674 movdqa xmm5, xmmword ptr [rsp+0x20]
1675 movdqa xmm6, xmmword ptr [rsp+0x40]
/* Store 32 bytes per input: xmm0 / xmm1 for the first, xmm8 / xmm9 for the second. */
1685 movups xmmword ptr [rbx], xmm0
1686 movups xmmword ptr [rbx+0x10], xmm1
1687 movups xmmword ptr [rbx+0x20], xmm8
1688 movups xmmword ptr [rbx+0x30], xmm9
/*
 * Under the mask from [rsp+0x130], replace the two per-lane vectors with
 * copies of themselves read 8 bytes further on ([rsp+0x118], [rsp+0x128]),
 * which moves lanes 2-3 down to lanes 0-1 where the mask is set.
 */
1689 movdqa xmm0, xmmword ptr [rsp+0x130]
1690 movdqa xmm1, xmmword ptr [rsp+0x110]
1691 movdqa xmm2, xmmword ptr [rsp+0x120]
1692 movdqu xmm3, xmmword ptr [rsp+0x118]
1693 movdqu xmm4, xmmword ptr [rsp+0x128]
1694 blendvps xmm1, xmm3, xmm0
1695 blendvps xmm2, xmm4, xmm0
1696 movdqa xmmword ptr [rsp+0x110], xmm1
1697 movdqa xmmword ptr [rsp+0x120], xmm2
/*
 * ---- One input ----
 * Load 32 bytes of state from [rcx], build the lane-0 vector in xmm13 as in
 * the two-input path, and keep the ROT8 / ROT16 tables in xmm14 / xmm15.
 */
1704 movups xmm0, xmmword ptr [rcx]
1705 movups xmm1, xmmword ptr [rcx+0x10]
1706 movd xmm13, dword ptr [rsp+0x110]
1707 pinsrd xmm13, dword ptr [rsp+0x120], 1
1708 pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1709 movaps xmm14, xmmword ptr [ROT8+rip]
1710 movaps xmm15, xmmword ptr [ROT16+rip]
/* r8 = the single input pointer; eax = byte at [rbp+0x40]. */
1711 mov r8, qword ptr [rdi]
1712 movzx eax, byte ptr [rbp+0x40]
/* xmm2 = the BLAKE3_IV table. */
1721 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
/* Read the 64-byte window ending at offset rdx and split it into even / odd dwords (xmm4-xmm7). */
1724 movups xmm4, xmmword ptr [r8+rdx-0x40]
1725 movups xmm5, xmmword ptr [r8+rdx-0x30]
1727 shufps xmm4, xmm5, 136
1728 shufps xmm8, xmm5, 221
1730 movups xmm6, xmmword ptr [r8+rdx-0x20]
1731 movups xmm7, xmmword ptr [r8+rdx-0x10]
1733 shufps xmm6, xmm7, 136
1734 pshufd xmm6, xmm6, 0x93
1735 shufps xmm8, xmm7, 221
1736 pshufd xmm7, xmm8, 0x93
/* Rotate three state rows (presumably diagonalize; confirm). */
1759 pshufd xmm0, xmm0, 0x93
1760 pshufd xmm3, xmm3, 0x4E
1761 pshufd xmm2, xmm2, 0x39
/* Inverse rotation of the same rows. */
1782 pshufd xmm0, xmm0, 0x39
1783 pshufd xmm3, xmm3, 0x4E
1784 pshufd xmm2, xmm2, 0x93
/* Reshuffle the message registers xmm4-xmm7 (presumably the per-round message permutation). */
1788 shufps xmm8, xmm5, 214
1789 pshufd xmm9, xmm4, 0x0F
1790 pshufd xmm4, xmm8, 0x39
1792 shufps xmm8, xmm7, 250
1793 pblendw xmm9, xmm8, 0xCC
1795 punpcklqdq xmm8, xmm5
1796 pblendw xmm8, xmm6, 0xC0
1797 pshufd xmm8, xmm8, 0x78
1798 punpckhdq xmm5, xmm7
1799 punpckldq xmm6, xmm5
1800 pshufd xmm7, xmm6, 0x1E
/* Store the 32-byte result for this input. */
1810 movups xmmword ptr [rbx], xmm0
1811 movups xmmword ptr [rbx+0x10], xmm1
/*
 * zfs_blake3_compress_in_place_sse41
 *
 * As far as the visible instructions show: loads 32 bytes from [rdi] as the
 * first two state rows, takes the third row from the BLAKE3_IV table, reads
 * 64 bytes of message from [rsi], and finally writes 32 bytes back to [rdi].
 *
 * NOTE(review): the embedded line numbers are not contiguous, so parts of
 * the routine (the mixing arithmetic, how xmm3 / xmm4 are filled before the
 * punpcklqdq, the final combine and the return) are not visible here.
 * Presumably rdi is the chaining value and rsi the message block; confirm
 * against the C prototype.
 */
1814 zfs_blake3_compress_in_place_sse41:
/* State rows 0 and 1 from [rdi], row 2 from BLAKE3_IV. */
1816 movups xmm0, xmmword ptr [rdi]
1817 movups xmm1, xmmword ptr [rdi+0x10]
1818 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
/* Row 3: low qword of xmm3 kept, low qword of xmm4 placed in the high half. */
1823 punpcklqdq xmm3, xmm4
/*
 * Load the 64-byte message from [rsi] and split it with shufps (136 selects
 * even dwords, 221 selects odd dwords); pshufd 0x93 rotates by one dword.
 */
1824 movups xmm4, xmmword ptr [rsi]
1825 movups xmm5, xmmword ptr [rsi+0x10]
1827 shufps xmm4, xmm5, 136
1828 shufps xmm8, xmm5, 221
1830 movups xmm6, xmmword ptr [rsi+0x20]
1831 movups xmm7, xmmword ptr [rsi+0x30]
1833 shufps xmm6, xmm7, 136
1834 pshufd xmm6, xmm6, 0x93
1835 shufps xmm8, xmm7, 221
1836 pshufd xmm7, xmm8, 0x93
/* Keep the ROT8 / ROT16 tables in xmm14 / xmm15. */
1837 movaps xmm14, xmmword ptr [ROT8+rip]
1838 movaps xmm15, xmmword ptr [ROT16+rip]
/* Rotate three state rows (presumably diagonalize; confirm). */
1861 pshufd xmm0, xmm0, 0x93
1862 pshufd xmm3, xmm3, 0x4E
1863 pshufd xmm2, xmm2, 0x39
/* Inverse rotation of the same rows. */
1884 pshufd xmm0, xmm0, 0x39
1885 pshufd xmm3, xmm3, 0x4E
1886 pshufd xmm2, xmm2, 0x93
/* Reshuffle the message registers xmm4-xmm7 (presumably the per-round message permutation). */
1890 shufps xmm8, xmm5, 214
1891 pshufd xmm9, xmm4, 0x0F
1892 pshufd xmm4, xmm8, 0x39
1894 shufps xmm8, xmm7, 250
1895 pblendw xmm9, xmm8, 0xCC
1897 punpcklqdq xmm8, xmm5
1898 pblendw xmm8, xmm6, 0xC0
1899 pshufd xmm8, xmm8, 0x78
1900 punpckhdq xmm5, xmm7
1901 punpckldq xmm6, xmm5
1902 pshufd xmm7, xmm6, 0x1E
/* Write the 32-byte result over the input at [rdi]. */
1909 movups xmmword ptr [rdi], xmm0
1910 movups xmmword ptr [rdi+0x10], xmm1
/*
 * zfs_blake3_compress_xof_sse41
 *
 * As far as the visible instructions show: the same load and shuffle
 * sequence as zfs_blake3_compress_in_place_sse41, but the 32 bytes at [rdi]
 * are only read (they are loaded a second time near the end) and a 64-byte
 * result (xmm0-xmm3) is stored to [r9].
 *
 * NOTE(review): the embedded line numbers are not contiguous, so parts of
 * the routine (the mixing arithmetic, how xmm3 / xmm4 are filled before the
 * punpcklqdq, how the reloaded xmm4 / xmm5 are combined, and the return) are
 * not visible here.  Presumably rdi is the chaining value, rsi the message
 * block and r9 the output buffer; confirm against the C prototype.
 */
1913 zfs_blake3_compress_xof_sse41:
/* State rows 0 and 1 from [rdi], row 2 from BLAKE3_IV. */
1915 movups xmm0, xmmword ptr [rdi]
1916 movups xmm1, xmmword ptr [rdi+0x10]
1917 movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
/* Row 3: low qword of xmm3 kept, low qword of xmm4 placed in the high half. */
1924 punpcklqdq xmm3, xmm4
/*
 * Load the 64-byte message from [rsi] and split it with shufps (136 selects
 * even dwords, 221 selects odd dwords); pshufd 0x93 rotates by one dword.
 */
1925 movups xmm4, xmmword ptr [rsi]
1926 movups xmm5, xmmword ptr [rsi+0x10]
1928 shufps xmm4, xmm5, 136
1929 shufps xmm8, xmm5, 221
1931 movups xmm6, xmmword ptr [rsi+0x20]
1932 movups xmm7, xmmword ptr [rsi+0x30]
1934 shufps xmm6, xmm7, 136
1935 pshufd xmm6, xmm6, 0x93
1936 shufps xmm8, xmm7, 221
1937 pshufd xmm7, xmm8, 0x93
/* Keep the ROT8 / ROT16 tables in xmm14 / xmm15. */
1938 movaps xmm14, xmmword ptr [ROT8+rip]
1939 movaps xmm15, xmmword ptr [ROT16+rip]
/* Rotate three state rows (presumably diagonalize; confirm). */
1962 pshufd xmm0, xmm0, 0x93
1963 pshufd xmm3, xmm3, 0x4E
1964 pshufd xmm2, xmm2, 0x39
/* Inverse rotation of the same rows. */
1985 pshufd xmm0, xmm0, 0x39
1986 pshufd xmm3, xmm3, 0x4E
1987 pshufd xmm2, xmm2, 0x93
/* Reshuffle the message registers xmm4-xmm7 (presumably the per-round message permutation). */
1991 shufps xmm8, xmm5, 214
1992 pshufd xmm9, xmm4, 0x0F
1993 pshufd xmm4, xmm8, 0x39
1995 shufps xmm8, xmm7, 250
1996 pblendw xmm9, xmm8, 0xCC
1998 punpcklqdq xmm8, xmm5
1999 pblendw xmm8, xmm6, 0xC0
2000 pshufd xmm8, xmm8, 0x78
2001 punpckhdq xmm5, xmm7
2002 punpckldq xmm6, xmm5
2003 pshufd xmm7, xmm6, 0x1E
/* Reload the original 32 bytes from [rdi]; how they are folded in is not visible here. */
2008 movdqu xmm4, xmmword ptr [rdi]
2009 movdqu xmm5, xmmword ptr [rdi+0x10]
/* Store all four state rows (64 bytes) to [r9]. */
2014 movups xmmword ptr [r9], xmm0
2015 movups xmmword ptr [r9+0x10], xmm1
2016 movups xmmword ptr [r9+0x20], xmm2
2017 movups xmmword ptr [r9+0x30], xmm3
/*
 * ELF symbol sizes.
 * NOTE(review): each expression is ". - symbol", evaluated at this point in
 * the file.  Because the three directives sit together here rather than
 * after their own function, the sizes recorded for the earlier symbols also
 * include the functions that follow them.
 */
2020 .size zfs_blake3_hash_many_sse41, . - zfs_blake3_hash_many_sse41
2021 .size zfs_blake3_compress_in_place_sse41, . - zfs_blake3_compress_in_place_sse41
2022 .size zfs_blake3_compress_xof_sse41, . - zfs_blake3_compress_xof_sse41
/*
 * Constant tables.
 * NOTE(review): the section directive, alignment and labels are not visible
 * in this listing, so the table names given below are presumed from the
 * values and from the symbols the code references; confirm.
 */
/* Four 32-bit words, presumably the table referenced as BLAKE3_IV. */
2031 .long 0x6A09E667, 0xBB67AE85
2032 .long 0x3C6EF372, 0xA54FF53A
/* Byte-index table that swaps the 16-bit halves of each dword (presumably ROT16). */
2034 .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
/* Byte-index table that rotates the bytes of each dword by one position (presumably ROT8). */
2036 .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
/* Each of the four words above repeated in all four lanes (presumably BLAKE3_IV_0 .. BLAKE3_IV_3). */
2042 .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
2044 .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
2046 .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
2048 .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/* The value 64 in all four lanes (presumably BLAKE3_BLOCK_LEN). */
2050 .long 64, 64, 64, 64
/* Only the top bit of each dword set (presumably CMP_MSB_MASK). */
2052 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
2054 #endif /* HAVE_SSE4_1 */
2057 .section .note.GNU-stack,"",%progbits