git.proxmox.com Git - mirror_zfs.git/blob - module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
Introduce BLAKE3 checksums as an OpenZFS feature
[mirror_zfs.git] / module / icp / asm-aarch64 / blake3 / b3_aarch64_sse41.S
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
24 * Copyright (c) 2019-2022 Samuel Neves
25 * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
26 *
27 * This is converted assembly: SSE4.1 -> ARMv8-A
28 * Used tools: SIMDe https://github.com/simd-everywhere/simde
29 */
30
31 #if defined(__aarch64__)
32 .text
/*
 * 16-byte constants loaded by zfs_blake3_compress_in_place_sse41.
 * The section is declared mergeable ("aM") with an entry size of 16.
 */
33 .section .rodata.cst16,"aM",@progbits,16
34 .p2align 4
/*
 * TBL byte indices {2,3,0,1, ...}: on a little-endian target this
 * rotates every 32-bit lane right by 16 bits.
 */
35 .LCPI0_0:
36 .byte 2
37 .byte 3
38 .byte 0
39 .byte 1
40 .byte 6
41 .byte 7
42 .byte 4
43 .byte 5
44 .byte 10
45 .byte 11
46 .byte 8
47 .byte 9
48 .byte 14
49 .byte 15
50 .byte 12
51 .byte 13
/*
 * Four 32-bit words: 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A.
 * NOTE(review): these match the first four BLAKE3 IV words - confirm
 * against the reference implementation.
 */
52 .LCPI0_1:
53 .word 1779033703
54 .word 3144134277
55 .word 1013904242
56 .word 2773480762
/*
 * TBL byte indices {1,2,3,0, ...}: on a little-endian target this
 * rotates every 32-bit lane right by 8 bits.
 */
57 .LCPI0_2:
58 .byte 1
59 .byte 2
60 .byte 3
61 .byte 0
62 .byte 5
63 .byte 6
64 .byte 7
65 .byte 4
66 .byte 9
67 .byte 10
68 .byte 11
69 .byte 8
70 .byte 13
71 .byte 14
72 .byte 15
73 .byte 12
/*
 * Two-register TBL indices: 32-bit words 0 and 2 come from the first
 * table register (bytes 0-3, 8-11), words 1 and 3 from the second
 * (bytes 20-23, 28-31).
 */
74 .LCPI0_3:
75 .byte 0
76 .byte 1
77 .byte 2
78 .byte 3
79 .byte 20
80 .byte 21
81 .byte 22
82 .byte 23
83 .byte 8
84 .byte 9
85 .byte 10
86 .byte 11
87 .byte 28
88 .byte 29
89 .byte 30
90 .byte 31
/*
 * Two-register TBL indices: 32-bit words 0-2 come from the first
 * table register (bytes 0-11), word 3 from the second (bytes 28-31).
 */
91 .LCPI0_4:
92 .byte 0
93 .byte 1
94 .byte 2
95 .byte 3
96 .byte 4
97 .byte 5
98 .byte 6
99 .byte 7
100 .byte 8
101 .byte 9
102 .byte 10
103 .byte 11
104 .byte 28
105 .byte 29
106 .byte 30
107 .byte 31
108 .text
/*
 * zfs_blake3_compress_in_place_sse41
 *
 * Register inputs, as used by the code below:
 *   x0 - pointer to 32 bytes that are loaded on entry and overwritten
 *        with the 32-byte result on exit
 *   x1 - pointer to 64 bytes of input, read only
 *   w2 - only the low 8 bits are used
 *   x3 - 64-bit value, consumed as two 32-bit words (low, high)
 *   w4 - only the low 8 bits are used
 * NOTE(review): these presumably map to cv, block, block_len, counter
 * and flags of the BLAKE3 compress routine - confirm against the C
 * prototype.
 *
 * Straight-line leaf code: no branches, no stack access.  Scratch
 * registers touched are x8-x13, v0-v7 and v16-v29.
 *
 * Rotations are spelled two ways: TBL with the .LCPI0_0 / .LCPI0_2
 * index vectors (kept in v0 / v1) for 16 and 8 bits, and
 * ushr/shl/orr triples for 12 (ushr #12, shl #20) and 7
 * (ushr #7, shl #25) bits.  The rotate-by-16 TBL appears 14 times.
 * NOTE(review): that is consistent with 7 rounds of two mixing
 * passes each - confirm against the BLAKE3 specification.
 */
109 .globl zfs_blake3_compress_in_place_sse41
110 .p2align 2
111 .type zfs_blake3_compress_in_place_sse41,@function
112 zfs_blake3_compress_in_place_sse41:
113 .cfi_startproc
/*
 * Load the 32 bytes at x0 into q7/q6 and the first 32 bytes at x1
 * into q17/q18.  The ld2 de-interleaves the 32 bytes at x1 + 32:
 * even-numbered 32-bit words go to v4, odd-numbered ones to v5.
 */
114 ldp q7, q6, [x0]
115 ldp q17, q18, [x1]
116 add x12, x1, #32
117 ld2 { v4.4s, v5.4s }, [x12]
/*
 * Over the following instructions v16 is assembled as
 * { low32(x3), high32(x3), w2 & 0xff, w4 & 0xff } while the constant
 * vectors are loaded into v0 (.LCPI0_0), v20 (.LCPI0_1),
 * v2 (.LCPI0_4), v3 (.LCPI0_3) and, further down, v1 (.LCPI0_2).
 */
118 lsr x10, x3, #32
119 fmov s16, w3
120 adrp x13, .LCPI0_0
121 adrp x11, .LCPI0_1
122 and w8, w2, #0xff
123 mov v16.s[1], w10
124 ldr q0, [x13, :lo12:.LCPI0_0]
125 ldr q20, [x11, :lo12:.LCPI0_1]
126 adrp x11, .LCPI0_4
127 and w9, w4, #0xff
128 ldr q2, [x11, :lo12:.LCPI0_4]
129 mov v16.s[2], w8
130 uzp1 v21.4s, v17.4s, v18.4s
131 add v7.4s, v6.4s, v7.4s
132 adrp x12, .LCPI0_3
133 mov v16.s[3], w9
134 uzp2 v18.4s, v17.4s, v18.4s
135 add v7.4s, v7.4s, v21.4s
136 ext v17.16b, v5.16b, v5.16b, #12
137 ldr q3, [x12, :lo12:.LCPI0_3]
138 ext v24.16b, v4.16b, v4.16b, #12
139 eor v16.16b, v7.16b, v16.16b
140 mov v27.16b, v17.16b
141 uzp1 v19.4s, v21.4s, v21.4s
142 ext v25.16b, v21.16b, v21.16b, #12
143 zip2 v28.4s, v18.4s, v17.4s
144 tbl v29.16b, { v16.16b }, v0.16b
145 mov v27.s[1], v24.s[2]
146 zip1 v23.2d, v17.2d, v18.2d
147 ext v19.16b, v19.16b, v21.16b, #8
148 add v22.4s, v29.4s, v20.4s
149 ext v26.16b, v21.16b, v25.16b, #12
150 tbl v20.16b, { v23.16b, v24.16b }, v2.16b
151 zip1 v21.4s, v28.4s, v24.4s
152 zip1 v23.4s, v24.4s, v28.4s
153 uzp2 v19.4s, v19.4s, v18.4s
154 eor v24.16b, v22.16b, v6.16b
155 ext v25.16b, v20.16b, v20.16b, #12
156 ext v6.16b, v23.16b, v21.16b, #8
157 add v7.4s, v7.4s, v18.4s
158 ext v18.16b, v19.16b, v19.16b, #4
159 tbl v16.16b, { v26.16b, v27.16b }, v3.16b
160 uzp1 v21.4s, v20.4s, v25.4s
161 mov v26.16b, v6.16b
162 ext v23.16b, v18.16b, v18.16b, #12
163 mov v26.s[1], v21.s[2]
164 adrp x10, .LCPI0_2
165 ext v25.16b, v18.16b, v23.16b, #12
166 uzp1 v23.4s, v18.4s, v18.4s
167 ldr q1, [x10, :lo12:.LCPI0_2]
168 ext v18.16b, v23.16b, v18.16b, #8
169 ushr v23.4s, v24.4s, #12
170 shl v24.4s, v24.4s, #20
171 orr v23.16b, v24.16b, v23.16b
172 add v7.4s, v7.4s, v23.4s
173 eor v27.16b, v29.16b, v7.16b
174 add v4.4s, v7.4s, v4.4s
175 tbl v7.16b, { v25.16b, v26.16b }, v3.16b
176 tbl v26.16b, { v27.16b }, v1.16b
177 add v22.4s, v22.4s, v26.4s
178 uzp2 v18.4s, v18.4s, v16.4s
179 eor v23.16b, v23.16b, v22.16b
180 ext v5.16b, v18.16b, v18.16b, #4
181 ushr v27.4s, v23.4s, #7
182 shl v23.4s, v23.4s, #25
183 uzp1 v25.4s, v5.4s, v5.4s
184 orr v23.16b, v23.16b, v27.16b
185 ext v28.16b, v4.16b, v4.16b, #12
186 ext v4.16b, v25.16b, v5.16b, #8
187 ext v25.16b, v26.16b, v26.16b, #8
188 add v26.4s, v28.4s, v23.4s
189 eor v25.16b, v26.16b, v25.16b
190 ext v22.16b, v22.16b, v22.16b, #4
191 tbl v25.16b, { v25.16b }, v0.16b
192 add v22.4s, v22.4s, v25.4s
193 eor v23.16b, v23.16b, v22.16b
194 add v17.4s, v26.4s, v17.4s
195 ushr v26.4s, v23.4s, #12
196 shl v23.4s, v23.4s, #20
197 orr v23.16b, v23.16b, v26.16b
198 add v17.4s, v17.4s, v23.4s
199 eor v25.16b, v25.16b, v17.16b
200 add v17.4s, v17.4s, v19.4s
201 tbl v19.16b, { v25.16b }, v1.16b
202 add v22.4s, v22.4s, v19.4s
203 eor v23.16b, v23.16b, v22.16b
204 ushr v25.4s, v23.4s, #7
205 shl v23.4s, v23.4s, #25
206 ext v17.16b, v17.16b, v17.16b, #4
207 orr v23.16b, v23.16b, v25.16b
208 ext v19.16b, v19.16b, v19.16b, #8
209 add v17.4s, v17.4s, v23.4s
210 eor v19.16b, v17.16b, v19.16b
211 ext v22.16b, v22.16b, v22.16b, #12
212 tbl v19.16b, { v19.16b }, v0.16b
213 add v22.4s, v22.4s, v19.4s
214 eor v23.16b, v23.16b, v22.16b
215 ushr v25.4s, v23.4s, #12
216 shl v23.4s, v23.4s, #20
217 add v17.4s, v17.4s, v16.4s
218 orr v23.16b, v23.16b, v25.16b
219 add v17.4s, v17.4s, v23.4s
220 ext v25.16b, v17.16b, v17.16b, #12
221 eor v17.16b, v19.16b, v17.16b
222 tbl v17.16b, { v17.16b }, v1.16b
223 add v19.4s, v22.4s, v17.4s
224 eor v22.16b, v23.16b, v19.16b
225 add v25.4s, v25.4s, v21.4s
226 zip1 v20.2d, v6.2d, v16.2d
227 ushr v23.4s, v22.4s, #7
228 shl v22.4s, v22.4s, #25
229 zip2 v24.4s, v16.4s, v6.4s
230 tbl v26.16b, { v20.16b, v21.16b }, v2.16b
231 orr v22.16b, v22.16b, v23.16b
232 zip1 v16.4s, v24.4s, v21.4s
233 zip1 v20.4s, v21.4s, v24.4s
234 ext v21.16b, v26.16b, v26.16b, #12
235 ext v17.16b, v17.16b, v17.16b, #8
236 add v25.4s, v25.4s, v22.4s
237 ext v16.16b, v20.16b, v16.16b, #8
238 uzp1 v21.4s, v26.4s, v21.4s
239 eor v26.16b, v25.16b, v17.16b
240 ext v19.16b, v19.16b, v19.16b, #4
241 tbl v26.16b, { v26.16b }, v0.16b
242 mov v29.16b, v16.16b
243 add v19.4s, v19.4s, v26.4s
244 ext v27.16b, v5.16b, v5.16b, #12
245 mov v29.s[1], v21.s[2]
246 eor v22.16b, v22.16b, v19.16b
247 ext v28.16b, v5.16b, v27.16b, #12
248 ushr v27.4s, v22.4s, #12
249 shl v22.4s, v22.4s, #20
250 add v6.4s, v25.4s, v6.4s
251 orr v22.16b, v22.16b, v27.16b
252 add v6.4s, v6.4s, v22.4s
253 eor v26.16b, v26.16b, v6.16b
254 add v6.4s, v6.4s, v18.4s
255 tbl v18.16b, { v26.16b }, v1.16b
256 add v19.4s, v19.4s, v18.4s
257 eor v22.16b, v22.16b, v19.16b
258 ushr v26.4s, v22.4s, #7
259 shl v22.4s, v22.4s, #25
260 ext v6.16b, v6.16b, v6.16b, #4
261 orr v22.16b, v22.16b, v26.16b
262 ext v18.16b, v18.16b, v18.16b, #8
263 add v6.4s, v6.4s, v22.4s
264 eor v18.16b, v6.16b, v18.16b
265 ext v19.16b, v19.16b, v19.16b, #12
266 tbl v18.16b, { v18.16b }, v0.16b
267 add v19.4s, v19.4s, v18.4s
268 eor v22.16b, v22.16b, v19.16b
269 ushr v26.4s, v22.4s, #12
270 shl v22.4s, v22.4s, #20
271 add v6.4s, v6.4s, v7.4s
272 orr v22.16b, v22.16b, v26.16b
273 add v6.4s, v6.4s, v22.4s
274 ext v26.16b, v6.16b, v6.16b, #12
275 eor v6.16b, v18.16b, v6.16b
276 uzp2 v4.4s, v4.4s, v7.4s
277 zip2 v25.4s, v7.4s, v16.4s
278 add v26.4s, v26.4s, v21.4s
279 zip1 v20.2d, v16.2d, v7.2d
280 tbl v6.16b, { v6.16b }, v1.16b
281 ext v24.16b, v4.16b, v4.16b, #4
282 tbl v27.16b, { v20.16b, v21.16b }, v2.16b
283 zip1 v7.4s, v25.4s, v21.4s
284 zip1 v20.4s, v21.4s, v25.4s
285 add v18.4s, v19.4s, v6.4s
286 uzp1 v5.4s, v24.4s, v24.4s
287 ext v21.16b, v27.16b, v27.16b, #12
288 ext v7.16b, v20.16b, v7.16b, #8
289 eor v19.16b, v22.16b, v18.16b
290 ext v5.16b, v5.16b, v24.16b, #8
291 tbl v17.16b, { v28.16b, v29.16b }, v3.16b
292 uzp1 v21.4s, v27.4s, v21.4s
293 mov v28.16b, v7.16b
294 ushr v22.4s, v19.4s, #7
295 shl v19.4s, v19.4s, #25
296 ext v23.16b, v24.16b, v24.16b, #12
297 uzp2 v5.4s, v5.4s, v17.4s
298 mov v28.s[1], v21.s[2]
299 orr v19.16b, v19.16b, v22.16b
300 ext v27.16b, v24.16b, v23.16b, #12
301 ext v23.16b, v5.16b, v5.16b, #4
302 ext v6.16b, v6.16b, v6.16b, #8
303 ext v25.16b, v18.16b, v18.16b, #4
304 add v18.4s, v26.4s, v19.4s
305 uzp1 v24.4s, v23.4s, v23.4s
306 eor v6.16b, v18.16b, v6.16b
307 ext v24.16b, v24.16b, v23.16b, #8
308 add v16.4s, v18.4s, v16.4s
309 tbl v18.16b, { v27.16b, v28.16b }, v3.16b
310 tbl v27.16b, { v6.16b }, v0.16b
311 uzp2 v6.4s, v24.4s, v18.4s
312 add v24.4s, v25.4s, v27.4s
313 eor v19.16b, v19.16b, v24.16b
314 ushr v25.4s, v19.4s, #12
315 shl v19.4s, v19.4s, #20
316 orr v19.16b, v19.16b, v25.16b
317 add v16.4s, v16.4s, v19.4s
318 eor v25.16b, v27.16b, v16.16b
319 add v4.4s, v16.4s, v4.4s
320 tbl v16.16b, { v25.16b }, v1.16b
321 add v24.4s, v24.4s, v16.4s
322 eor v19.16b, v19.16b, v24.16b
323 ushr v25.4s, v19.4s, #7
324 shl v19.4s, v19.4s, #25
325 ext v4.16b, v4.16b, v4.16b, #4
326 orr v19.16b, v19.16b, v25.16b
327 ext v16.16b, v16.16b, v16.16b, #8
328 add v4.4s, v4.4s, v19.4s
329 eor v16.16b, v4.16b, v16.16b
330 ext v24.16b, v24.16b, v24.16b, #12
331 tbl v25.16b, { v16.16b }, v0.16b
332 add v24.4s, v24.4s, v25.4s
333 eor v16.16b, v19.16b, v24.16b
334 ushr v19.4s, v16.4s, #12
335 shl v16.4s, v16.4s, #20
336 add v4.4s, v4.4s, v17.4s
337 orr v19.16b, v16.16b, v19.16b
338 add v27.4s, v4.4s, v19.4s
339 eor v25.16b, v25.16b, v27.16b
340 tbl v25.16b, { v25.16b }, v1.16b
341 add v24.4s, v24.4s, v25.4s
342 zip2 v26.4s, v17.4s, v7.4s
343 ext v4.16b, v27.16b, v27.16b, #12
344 eor v19.16b, v19.16b, v24.16b
345 add v28.4s, v4.4s, v21.4s
346 zip1 v20.2d, v7.2d, v17.2d
347 zip1 v4.4s, v26.4s, v21.4s
348 zip1 v17.4s, v21.4s, v26.4s
349 ushr v26.4s, v19.4s, #7
350 shl v19.4s, v19.4s, #25
351 orr v19.16b, v19.16b, v26.16b
352 ext v25.16b, v25.16b, v25.16b, #8
353 add v27.4s, v28.4s, v19.4s
354 eor v25.16b, v27.16b, v25.16b
355 ext v24.16b, v24.16b, v24.16b, #4
356 tbl v25.16b, { v25.16b }, v0.16b
357 add v24.4s, v24.4s, v25.4s
358 eor v19.16b, v19.16b, v24.16b
359 add v7.4s, v27.4s, v7.4s
360 ushr v27.4s, v19.4s, #12
361 shl v19.4s, v19.4s, #20
362 orr v19.16b, v19.16b, v27.16b
363 add v7.4s, v7.4s, v19.4s
364 eor v25.16b, v25.16b, v7.16b
365 add v5.4s, v7.4s, v5.4s
366 tbl v7.16b, { v25.16b }, v1.16b
367 add v24.4s, v24.4s, v7.4s
368 eor v19.16b, v19.16b, v24.16b
369 ushr v25.4s, v19.4s, #7
370 shl v19.4s, v19.4s, #25
371 ext v5.16b, v5.16b, v5.16b, #4
372 orr v19.16b, v19.16b, v25.16b
373 ext v7.16b, v7.16b, v7.16b, #8
374 add v5.4s, v5.4s, v19.4s
375 eor v7.16b, v5.16b, v7.16b
376 ext v24.16b, v24.16b, v24.16b, #12
377 tbl v7.16b, { v7.16b }, v0.16b
378 add v24.4s, v24.4s, v7.4s
379 eor v19.16b, v19.16b, v24.16b
380 ushr v25.4s, v19.4s, #12
381 shl v19.4s, v19.4s, #20
382 tbl v16.16b, { v20.16b, v21.16b }, v2.16b
383 add v5.4s, v5.4s, v18.4s
384 orr v19.16b, v19.16b, v25.16b
385 ext v20.16b, v16.16b, v16.16b, #12
386 ext v4.16b, v17.16b, v4.16b, #8
387 add v5.4s, v5.4s, v19.4s
388 uzp1 v21.4s, v16.4s, v20.4s
389 mov v17.16b, v4.16b
390 ext v25.16b, v5.16b, v5.16b, #12
391 mov v17.s[1], v21.s[2]
392 add v25.4s, v25.4s, v21.4s
393 zip1 v20.2d, v4.2d, v18.2d
394 ext v22.16b, v23.16b, v23.16b, #12
395 zip2 v26.4s, v18.4s, v4.4s
396 tbl v18.16b, { v20.16b, v21.16b }, v2.16b
397 eor v5.16b, v7.16b, v5.16b
398 ext v16.16b, v23.16b, v22.16b, #12
399 ext v22.16b, v6.16b, v6.16b, #4
400 zip1 v27.4s, v26.4s, v21.4s
401 zip1 v20.4s, v21.4s, v26.4s
402 ext v21.16b, v18.16b, v18.16b, #12
403 tbl v5.16b, { v5.16b }, v1.16b
404 ext v20.16b, v20.16b, v27.16b, #8
405 uzp1 v27.4s, v18.4s, v21.4s
406 uzp1 v18.4s, v22.4s, v22.4s
407 add v21.4s, v24.4s, v5.4s
408 ext v18.16b, v18.16b, v22.16b, #8
409 eor v19.16b, v19.16b, v21.16b
410 tbl v7.16b, { v16.16b, v17.16b }, v3.16b
411 uzp2 v18.4s, v18.4s, v17.4s
412 zip2 v16.4s, v16.4s, v20.4s
413 ushr v17.4s, v19.4s, #7
414 shl v19.4s, v19.4s, #25
415 orr v17.16b, v19.16b, v17.16b
416 ext v5.16b, v5.16b, v5.16b, #8
417 add v19.4s, v25.4s, v17.4s
418 eor v5.16b, v19.16b, v5.16b
419 ext v21.16b, v21.16b, v21.16b, #4
420 tbl v5.16b, { v5.16b }, v0.16b
421 add v4.4s, v19.4s, v4.4s
422 add v19.4s, v21.4s, v5.4s
423 eor v17.16b, v17.16b, v19.16b
424 ushr v21.4s, v17.4s, #12
425 shl v17.4s, v17.4s, #20
426 orr v17.16b, v17.16b, v21.16b
427 add v4.4s, v4.4s, v17.4s
428 eor v5.16b, v5.16b, v4.16b
429 tbl v5.16b, { v5.16b }, v1.16b
430 add v4.4s, v4.4s, v6.4s
431 add v6.4s, v19.4s, v5.4s
432 eor v17.16b, v17.16b, v6.16b
433 ushr v19.4s, v17.4s, #7
434 shl v17.4s, v17.4s, #25
435 ext v4.16b, v4.16b, v4.16b, #4
436 orr v17.16b, v17.16b, v19.16b
437 ext v5.16b, v5.16b, v5.16b, #8
438 add v4.4s, v4.4s, v17.4s
439 eor v5.16b, v4.16b, v5.16b
440 ext v6.16b, v6.16b, v6.16b, #12
441 tbl v5.16b, { v5.16b }, v0.16b
442 add v6.4s, v6.4s, v5.4s
443 eor v17.16b, v17.16b, v6.16b
444 ushr v19.4s, v17.4s, #12
445 shl v17.4s, v17.4s, #20
446 add v4.4s, v4.4s, v7.4s
447 orr v17.16b, v17.16b, v19.16b
448 add v4.4s, v4.4s, v17.4s
449 eor v5.16b, v5.16b, v4.16b
450 tbl v5.16b, { v5.16b }, v1.16b
451 mov v29.16b, v20.16b
452 ext v4.16b, v4.16b, v4.16b, #12
453 add v6.4s, v6.4s, v5.4s
454 mov v29.s[1], v27.s[2]
455 add v4.4s, v4.4s, v27.4s
456 zip1 v26.2d, v20.2d, v7.2d
457 zip1 v7.4s, v16.4s, v27.4s
458 zip1 v16.4s, v27.4s, v16.4s
459 eor v17.16b, v17.16b, v6.16b
460 ext v7.16b, v16.16b, v7.16b, #8
461 ushr v16.4s, v17.4s, #7
462 shl v17.4s, v17.4s, #25
463 orr v16.16b, v17.16b, v16.16b
464 ext v5.16b, v5.16b, v5.16b, #8
465 add v4.4s, v4.4s, v16.4s
466 eor v5.16b, v4.16b, v5.16b
467 ext v6.16b, v6.16b, v6.16b, #4
468 tbl v5.16b, { v5.16b }, v0.16b
469 add v6.4s, v6.4s, v5.4s
470 eor v16.16b, v16.16b, v6.16b
471 ushr v17.4s, v16.4s, #12
472 shl v16.4s, v16.4s, #20
473 add v4.4s, v4.4s, v20.4s
474 orr v16.16b, v16.16b, v17.16b
475 add v4.4s, v4.4s, v16.4s
476 eor v5.16b, v5.16b, v4.16b
477 tbl v5.16b, { v5.16b }, v1.16b
478 add v6.4s, v6.4s, v5.4s
479 eor v16.16b, v16.16b, v6.16b
480 add v4.4s, v4.4s, v18.4s
481 ushr v17.4s, v16.4s, #7
482 shl v16.4s, v16.4s, #25
483 ext v23.16b, v22.16b, v22.16b, #12
484 ext v4.16b, v4.16b, v4.16b, #4
485 orr v16.16b, v16.16b, v17.16b
486 ext v28.16b, v22.16b, v23.16b, #12
487 ext v5.16b, v5.16b, v5.16b, #8
488 add v4.4s, v16.4s, v4.4s
489 tbl v3.16b, { v28.16b, v29.16b }, v3.16b
490 eor v5.16b, v4.16b, v5.16b
491 ext v6.16b, v6.16b, v6.16b, #12
492 add v3.4s, v4.4s, v3.4s
493 tbl v4.16b, { v5.16b }, v0.16b
494 add v5.4s, v6.4s, v4.4s
495 eor v6.16b, v16.16b, v5.16b
496 ushr v16.4s, v6.4s, #12
497 shl v6.4s, v6.4s, #20
498 orr v6.16b, v6.16b, v16.16b
499 tbl v2.16b, { v26.16b, v27.16b }, v2.16b
500 add v3.4s, v3.4s, v6.4s
501 ext v19.16b, v2.16b, v2.16b, #12
502 eor v4.16b, v4.16b, v3.16b
503 uzp1 v2.4s, v2.4s, v19.4s
504 ext v3.16b, v3.16b, v3.16b, #12
505 tbl v4.16b, { v4.16b }, v1.16b
506 add v2.4s, v3.4s, v2.4s
507 add v3.4s, v5.4s, v4.4s
508 eor v5.16b, v6.16b, v3.16b
509 ushr v6.4s, v5.4s, #7
510 shl v5.4s, v5.4s, #25
511 orr v5.16b, v5.16b, v6.16b
512 ext v4.16b, v4.16b, v4.16b, #8
513 add v2.4s, v2.4s, v5.4s
514 eor v4.16b, v2.16b, v4.16b
515 ext v3.16b, v3.16b, v3.16b, #4
516 tbl v0.16b, { v4.16b }, v0.16b
517 add v3.4s, v3.4s, v0.4s
518 eor v4.16b, v5.16b, v3.16b
519 ushr v5.4s, v4.4s, #12
520 shl v4.4s, v4.4s, #20
521 add v2.4s, v2.4s, v7.4s
522 orr v4.16b, v4.16b, v5.16b
523 add v2.4s, v2.4s, v4.4s
524 eor v0.16b, v0.16b, v2.16b
525 tbl v0.16b, { v0.16b }, v1.16b
526 add v1.4s, v3.4s, v0.4s
527 eor v3.16b, v4.16b, v1.16b
528 ext v2.16b, v2.16b, v2.16b, #4
529 ext v1.16b, v1.16b, v1.16b, #12
530 ushr v4.4s, v3.4s, #7
531 shl v3.4s, v3.4s, #25
532 ext v0.16b, v0.16b, v0.16b, #8
533 eor v1.16b, v2.16b, v1.16b
534 orr v2.16b, v3.16b, v4.16b
535 eor v0.16b, v2.16b, v0.16b
/*
 * q1 and q0 each hold the XOR of two of the four working vectors;
 * store them as the 32-byte result over the input at x0.
 */
536 stp q1, q0, [x0]
537 ret
538 .Lfunc_end0:
539 .size zfs_blake3_compress_in_place_sse41, .Lfunc_end0-zfs_blake3_compress_in_place_sse41
540 .cfi_endproc
541
/*
 * 16-byte constants loaded by zfs_blake3_compress_xof_sse41.  The
 * values are the same as in .LCPI0_0 - .LCPI0_4; the section is
 * declared mergeable ("aM") with an entry size of 16.
 */
542 .section .rodata.cst16,"aM",@progbits,16
543 .p2align 4
/*
 * TBL byte indices {2,3,0,1, ...}: on a little-endian target this
 * rotates every 32-bit lane right by 16 bits.
 */
544 .LCPI1_0:
545 .byte 2
546 .byte 3
547 .byte 0
548 .byte 1
549 .byte 6
550 .byte 7
551 .byte 4
552 .byte 5
553 .byte 10
554 .byte 11
555 .byte 8
556 .byte 9
557 .byte 14
558 .byte 15
559 .byte 12
560 .byte 13
/*
 * Four 32-bit words: 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A.
 * NOTE(review): these match the first four BLAKE3 IV words - confirm
 * against the reference implementation.
 */
561 .LCPI1_1:
562 .word 1779033703
563 .word 3144134277
564 .word 1013904242
565 .word 2773480762
/*
 * TBL byte indices {1,2,3,0, ...}: on a little-endian target this
 * rotates every 32-bit lane right by 8 bits.
 */
566 .LCPI1_2:
567 .byte 1
568 .byte 2
569 .byte 3
570 .byte 0
571 .byte 5
572 .byte 6
573 .byte 7
574 .byte 4
575 .byte 9
576 .byte 10
577 .byte 11
578 .byte 8
579 .byte 13
580 .byte 14
581 .byte 15
582 .byte 12
/*
 * Two-register TBL indices: 32-bit words 0 and 2 come from the first
 * table register (bytes 0-3, 8-11), words 1 and 3 from the second
 * (bytes 20-23, 28-31).
 */
583 .LCPI1_3:
584 .byte 0
585 .byte 1
586 .byte 2
587 .byte 3
588 .byte 20
589 .byte 21
590 .byte 22
591 .byte 23
592 .byte 8
593 .byte 9
594 .byte 10
595 .byte 11
596 .byte 28
597 .byte 29
598 .byte 30
599 .byte 31
/*
 * Two-register TBL indices: 32-bit words 0-2 come from the first
 * table register (bytes 0-11), word 3 from the second (bytes 28-31).
 */
600 .LCPI1_4:
601 .byte 0
602 .byte 1
603 .byte 2
604 .byte 3
605 .byte 4
606 .byte 5
607 .byte 6
608 .byte 7
609 .byte 8
610 .byte 9
611 .byte 10
612 .byte 11
613 .byte 28
614 .byte 29
615 .byte 30
616 .byte 31
617 .text
/*
 * zfs_blake3_compress_xof_sse41
 *
 * Register inputs, as used by the code below:
 *   x0 - pointer to 32 bytes of input; loaded on entry and again
 *        just before the final stores, never written
 *   x1 - pointer to 64 bytes of input, read only
 *   w2 - only the low 8 bits are used
 *   x3 - 64-bit value, consumed as two 32-bit words (low, high)
 *   w4 - only the low 8 bits are used
 *   x5 - pointer to a 64-byte output buffer
 * NOTE(review): these presumably map to cv, block, block_len,
 * counter, flags and out of the BLAKE3 compress_xof routine -
 * confirm against the C prototype.
 *
 * Straight-line leaf code: no branches, no stack access.  Scratch
 * registers touched are x8-x13, v0-v7 and v16-v29.
 *
 * Rotations are spelled two ways: TBL with the .LCPI1_0 / .LCPI1_2
 * index vectors (kept in v0 / v1) for 16 and 8 bits, and
 * ushr/shl/orr triples for 12 (ushr #12, shl #20) and 7
 * (ushr #7, shl #25) bits.  The rotate-by-16 TBL appears 14 times.
 * NOTE(review): that is consistent with 7 rounds of two mixing
 * passes each - confirm against the BLAKE3 specification.
 */
618 .globl zfs_blake3_compress_xof_sse41
619 .p2align 2
620 .type zfs_blake3_compress_xof_sse41,@function
621 zfs_blake3_compress_xof_sse41:
622 .cfi_startproc
/*
 * Load the 32 bytes at x0 into q7/q6 and the first 32 bytes at x1
 * into q17/q18.  The ld2 de-interleaves the 32 bytes at x1 + 32:
 * even-numbered 32-bit words go to v4, odd-numbered ones to v5.
 */
623 ldp q7, q6, [x0]
624 ldp q17, q18, [x1]
625 add x12, x1, #32
626 ld2 { v4.4s, v5.4s }, [x12]
/*
 * Over the following instructions v16 is assembled as
 * { low32(x3), high32(x3), w2 & 0xff, w4 & 0xff } while the constant
 * vectors are loaded into v0 (.LCPI1_0), v20 (.LCPI1_1),
 * v2 (.LCPI1_4), v3 (.LCPI1_3) and, further down, v1 (.LCPI1_2).
 */
627 lsr x10, x3, #32
628 fmov s16, w3
629 adrp x13, .LCPI1_0
630 adrp x11, .LCPI1_1
631 and w8, w2, #0xff
632 mov v16.s[1], w10
633 ldr q0, [x13, :lo12:.LCPI1_0]
634 ldr q20, [x11, :lo12:.LCPI1_1]
635 adrp x11, .LCPI1_4
636 and w9, w4, #0xff
637 ldr q2, [x11, :lo12:.LCPI1_4]
638 mov v16.s[2], w8
639 uzp1 v21.4s, v17.4s, v18.4s
640 add v7.4s, v6.4s, v7.4s
641 adrp x12, .LCPI1_3
642 mov v16.s[3], w9
643 uzp2 v18.4s, v17.4s, v18.4s
644 add v7.4s, v7.4s, v21.4s
645 ext v17.16b, v5.16b, v5.16b, #12
646 ldr q3, [x12, :lo12:.LCPI1_3]
647 ext v24.16b, v4.16b, v4.16b, #12
648 eor v16.16b, v7.16b, v16.16b
649 mov v27.16b, v17.16b
650 uzp1 v19.4s, v21.4s, v21.4s
651 ext v25.16b, v21.16b, v21.16b, #12
652 zip2 v28.4s, v18.4s, v17.4s
653 tbl v29.16b, { v16.16b }, v0.16b
654 mov v27.s[1], v24.s[2]
655 zip1 v23.2d, v17.2d, v18.2d
656 ext v19.16b, v19.16b, v21.16b, #8
657 add v22.4s, v29.4s, v20.4s
658 ext v26.16b, v21.16b, v25.16b, #12
659 tbl v20.16b, { v23.16b, v24.16b }, v2.16b
660 zip1 v21.4s, v28.4s, v24.4s
661 zip1 v23.4s, v24.4s, v28.4s
662 uzp2 v19.4s, v19.4s, v18.4s
663 eor v24.16b, v22.16b, v6.16b
664 ext v25.16b, v20.16b, v20.16b, #12
665 ext v6.16b, v23.16b, v21.16b, #8
666 add v7.4s, v7.4s, v18.4s
667 ext v18.16b, v19.16b, v19.16b, #4
668 tbl v16.16b, { v26.16b, v27.16b }, v3.16b
669 uzp1 v21.4s, v20.4s, v25.4s
670 mov v26.16b, v6.16b
671 ext v23.16b, v18.16b, v18.16b, #12
672 mov v26.s[1], v21.s[2]
673 adrp x10, .LCPI1_2
674 ext v25.16b, v18.16b, v23.16b, #12
675 uzp1 v23.4s, v18.4s, v18.4s
676 ldr q1, [x10, :lo12:.LCPI1_2]
677 ext v18.16b, v23.16b, v18.16b, #8
678 ushr v23.4s, v24.4s, #12
679 shl v24.4s, v24.4s, #20
680 orr v23.16b, v24.16b, v23.16b
681 add v7.4s, v7.4s, v23.4s
682 eor v27.16b, v29.16b, v7.16b
683 add v4.4s, v7.4s, v4.4s
684 tbl v7.16b, { v25.16b, v26.16b }, v3.16b
685 tbl v26.16b, { v27.16b }, v1.16b
686 add v22.4s, v22.4s, v26.4s
687 uzp2 v18.4s, v18.4s, v16.4s
688 eor v23.16b, v23.16b, v22.16b
689 ext v5.16b, v18.16b, v18.16b, #4
690 ushr v27.4s, v23.4s, #7
691 shl v23.4s, v23.4s, #25
692 uzp1 v25.4s, v5.4s, v5.4s
693 orr v23.16b, v23.16b, v27.16b
694 ext v28.16b, v4.16b, v4.16b, #12
695 ext v4.16b, v25.16b, v5.16b, #8
696 ext v25.16b, v26.16b, v26.16b, #8
697 add v26.4s, v28.4s, v23.4s
698 eor v25.16b, v26.16b, v25.16b
699 ext v22.16b, v22.16b, v22.16b, #4
700 tbl v25.16b, { v25.16b }, v0.16b
701 add v22.4s, v22.4s, v25.4s
702 eor v23.16b, v23.16b, v22.16b
703 add v17.4s, v26.4s, v17.4s
704 ushr v26.4s, v23.4s, #12
705 shl v23.4s, v23.4s, #20
706 orr v23.16b, v23.16b, v26.16b
707 add v17.4s, v17.4s, v23.4s
708 eor v25.16b, v25.16b, v17.16b
709 add v17.4s, v17.4s, v19.4s
710 tbl v19.16b, { v25.16b }, v1.16b
711 add v22.4s, v22.4s, v19.4s
712 eor v23.16b, v23.16b, v22.16b
713 ushr v25.4s, v23.4s, #7
714 shl v23.4s, v23.4s, #25
715 ext v17.16b, v17.16b, v17.16b, #4
716 orr v23.16b, v23.16b, v25.16b
717 ext v19.16b, v19.16b, v19.16b, #8
718 add v17.4s, v17.4s, v23.4s
719 eor v19.16b, v17.16b, v19.16b
720 ext v22.16b, v22.16b, v22.16b, #12
721 tbl v19.16b, { v19.16b }, v0.16b
722 add v22.4s, v22.4s, v19.4s
723 eor v23.16b, v23.16b, v22.16b
724 ushr v25.4s, v23.4s, #12
725 shl v23.4s, v23.4s, #20
726 add v17.4s, v17.4s, v16.4s
727 orr v23.16b, v23.16b, v25.16b
728 add v17.4s, v17.4s, v23.4s
729 ext v25.16b, v17.16b, v17.16b, #12
730 eor v17.16b, v19.16b, v17.16b
731 tbl v17.16b, { v17.16b }, v1.16b
732 add v19.4s, v22.4s, v17.4s
733 eor v22.16b, v23.16b, v19.16b
734 add v25.4s, v25.4s, v21.4s
735 zip1 v20.2d, v6.2d, v16.2d
736 ushr v23.4s, v22.4s, #7
737 shl v22.4s, v22.4s, #25
738 zip2 v24.4s, v16.4s, v6.4s
739 tbl v26.16b, { v20.16b, v21.16b }, v2.16b
740 orr v22.16b, v22.16b, v23.16b
741 zip1 v16.4s, v24.4s, v21.4s
742 zip1 v20.4s, v21.4s, v24.4s
743 ext v21.16b, v26.16b, v26.16b, #12
744 ext v17.16b, v17.16b, v17.16b, #8
745 add v25.4s, v25.4s, v22.4s
746 ext v16.16b, v20.16b, v16.16b, #8
747 uzp1 v21.4s, v26.4s, v21.4s
748 eor v26.16b, v25.16b, v17.16b
749 ext v19.16b, v19.16b, v19.16b, #4
750 tbl v26.16b, { v26.16b }, v0.16b
751 mov v29.16b, v16.16b
752 add v19.4s, v19.4s, v26.4s
753 ext v27.16b, v5.16b, v5.16b, #12
754 mov v29.s[1], v21.s[2]
755 eor v22.16b, v22.16b, v19.16b
756 ext v28.16b, v5.16b, v27.16b, #12
757 ushr v27.4s, v22.4s, #12
758 shl v22.4s, v22.4s, #20
759 add v6.4s, v25.4s, v6.4s
760 orr v22.16b, v22.16b, v27.16b
761 add v6.4s, v6.4s, v22.4s
762 eor v26.16b, v26.16b, v6.16b
763 add v6.4s, v6.4s, v18.4s
764 tbl v18.16b, { v26.16b }, v1.16b
765 add v19.4s, v19.4s, v18.4s
766 eor v22.16b, v22.16b, v19.16b
767 ushr v26.4s, v22.4s, #7
768 shl v22.4s, v22.4s, #25
769 ext v6.16b, v6.16b, v6.16b, #4
770 orr v22.16b, v22.16b, v26.16b
771 ext v18.16b, v18.16b, v18.16b, #8
772 add v6.4s, v6.4s, v22.4s
773 eor v18.16b, v6.16b, v18.16b
774 ext v19.16b, v19.16b, v19.16b, #12
775 tbl v18.16b, { v18.16b }, v0.16b
776 add v19.4s, v19.4s, v18.4s
777 eor v22.16b, v22.16b, v19.16b
778 ushr v26.4s, v22.4s, #12
779 shl v22.4s, v22.4s, #20
780 add v6.4s, v6.4s, v7.4s
781 orr v22.16b, v22.16b, v26.16b
782 add v6.4s, v6.4s, v22.4s
783 ext v26.16b, v6.16b, v6.16b, #12
784 eor v6.16b, v18.16b, v6.16b
785 uzp2 v4.4s, v4.4s, v7.4s
786 zip2 v25.4s, v7.4s, v16.4s
787 add v26.4s, v26.4s, v21.4s
788 zip1 v20.2d, v16.2d, v7.2d
789 tbl v6.16b, { v6.16b }, v1.16b
790 ext v24.16b, v4.16b, v4.16b, #4
791 tbl v27.16b, { v20.16b, v21.16b }, v2.16b
792 zip1 v7.4s, v25.4s, v21.4s
793 zip1 v20.4s, v21.4s, v25.4s
794 add v18.4s, v19.4s, v6.4s
795 uzp1 v5.4s, v24.4s, v24.4s
796 ext v21.16b, v27.16b, v27.16b, #12
797 ext v7.16b, v20.16b, v7.16b, #8
798 eor v19.16b, v22.16b, v18.16b
799 ext v5.16b, v5.16b, v24.16b, #8
800 tbl v17.16b, { v28.16b, v29.16b }, v3.16b
801 uzp1 v21.4s, v27.4s, v21.4s
802 mov v28.16b, v7.16b
803 ushr v22.4s, v19.4s, #7
804 shl v19.4s, v19.4s, #25
805 ext v23.16b, v24.16b, v24.16b, #12
806 uzp2 v5.4s, v5.4s, v17.4s
807 mov v28.s[1], v21.s[2]
808 orr v19.16b, v19.16b, v22.16b
809 ext v27.16b, v24.16b, v23.16b, #12
810 ext v23.16b, v5.16b, v5.16b, #4
811 ext v6.16b, v6.16b, v6.16b, #8
812 ext v25.16b, v18.16b, v18.16b, #4
813 add v18.4s, v26.4s, v19.4s
814 uzp1 v24.4s, v23.4s, v23.4s
815 eor v6.16b, v18.16b, v6.16b
816 ext v24.16b, v24.16b, v23.16b, #8
817 add v16.4s, v18.4s, v16.4s
818 tbl v18.16b, { v27.16b, v28.16b }, v3.16b
819 tbl v27.16b, { v6.16b }, v0.16b
820 uzp2 v6.4s, v24.4s, v18.4s
821 add v24.4s, v25.4s, v27.4s
822 eor v19.16b, v19.16b, v24.16b
823 ushr v25.4s, v19.4s, #12
824 shl v19.4s, v19.4s, #20
825 orr v19.16b, v19.16b, v25.16b
826 add v16.4s, v16.4s, v19.4s
827 eor v25.16b, v27.16b, v16.16b
828 add v4.4s, v16.4s, v4.4s
829 tbl v16.16b, { v25.16b }, v1.16b
830 add v24.4s, v24.4s, v16.4s
831 eor v19.16b, v19.16b, v24.16b
832 ushr v25.4s, v19.4s, #7
833 shl v19.4s, v19.4s, #25
834 ext v4.16b, v4.16b, v4.16b, #4
835 orr v19.16b, v19.16b, v25.16b
836 ext v16.16b, v16.16b, v16.16b, #8
837 add v4.4s, v4.4s, v19.4s
838 eor v16.16b, v4.16b, v16.16b
839 ext v24.16b, v24.16b, v24.16b, #12
840 tbl v25.16b, { v16.16b }, v0.16b
841 add v24.4s, v24.4s, v25.4s
842 eor v16.16b, v19.16b, v24.16b
843 ushr v19.4s, v16.4s, #12
844 shl v16.4s, v16.4s, #20
845 add v4.4s, v4.4s, v17.4s
846 orr v19.16b, v16.16b, v19.16b
847 add v27.4s, v4.4s, v19.4s
848 eor v25.16b, v25.16b, v27.16b
849 tbl v25.16b, { v25.16b }, v1.16b
850 add v24.4s, v24.4s, v25.4s
851 zip2 v26.4s, v17.4s, v7.4s
852 ext v4.16b, v27.16b, v27.16b, #12
853 eor v19.16b, v19.16b, v24.16b
854 add v28.4s, v4.4s, v21.4s
855 zip1 v20.2d, v7.2d, v17.2d
856 zip1 v4.4s, v26.4s, v21.4s
857 zip1 v17.4s, v21.4s, v26.4s
858 ushr v26.4s, v19.4s, #7
859 shl v19.4s, v19.4s, #25
860 orr v19.16b, v19.16b, v26.16b
861 ext v25.16b, v25.16b, v25.16b, #8
862 add v27.4s, v28.4s, v19.4s
863 eor v25.16b, v27.16b, v25.16b
864 ext v24.16b, v24.16b, v24.16b, #4
865 tbl v25.16b, { v25.16b }, v0.16b
866 add v24.4s, v24.4s, v25.4s
867 eor v19.16b, v19.16b, v24.16b
868 add v7.4s, v27.4s, v7.4s
869 ushr v27.4s, v19.4s, #12
870 shl v19.4s, v19.4s, #20
871 orr v19.16b, v19.16b, v27.16b
872 add v7.4s, v7.4s, v19.4s
873 eor v25.16b, v25.16b, v7.16b
874 add v5.4s, v7.4s, v5.4s
875 tbl v7.16b, { v25.16b }, v1.16b
876 add v24.4s, v24.4s, v7.4s
877 eor v19.16b, v19.16b, v24.16b
878 ushr v25.4s, v19.4s, #7
879 shl v19.4s, v19.4s, #25
880 ext v5.16b, v5.16b, v5.16b, #4
881 orr v19.16b, v19.16b, v25.16b
882 ext v7.16b, v7.16b, v7.16b, #8
883 add v5.4s, v5.4s, v19.4s
884 eor v7.16b, v5.16b, v7.16b
885 ext v24.16b, v24.16b, v24.16b, #12
886 tbl v7.16b, { v7.16b }, v0.16b
887 add v24.4s, v24.4s, v7.4s
888 eor v19.16b, v19.16b, v24.16b
889 ushr v25.4s, v19.4s, #12
890 shl v19.4s, v19.4s, #20
891 tbl v16.16b, { v20.16b, v21.16b }, v2.16b
892 add v5.4s, v5.4s, v18.4s
893 orr v19.16b, v19.16b, v25.16b
894 ext v20.16b, v16.16b, v16.16b, #12
895 ext v4.16b, v17.16b, v4.16b, #8
896 add v5.4s, v5.4s, v19.4s
897 uzp1 v21.4s, v16.4s, v20.4s
898 mov v17.16b, v4.16b
899 ext v25.16b, v5.16b, v5.16b, #12
900 mov v17.s[1], v21.s[2]
901 add v25.4s, v25.4s, v21.4s
902 zip1 v20.2d, v4.2d, v18.2d
903 ext v22.16b, v23.16b, v23.16b, #12
904 zip2 v26.4s, v18.4s, v4.4s
905 tbl v18.16b, { v20.16b, v21.16b }, v2.16b
906 eor v5.16b, v7.16b, v5.16b
907 ext v16.16b, v23.16b, v22.16b, #12
908 ext v22.16b, v6.16b, v6.16b, #4
909 zip1 v27.4s, v26.4s, v21.4s
910 zip1 v20.4s, v21.4s, v26.4s
911 ext v21.16b, v18.16b, v18.16b, #12
912 tbl v5.16b, { v5.16b }, v1.16b
913 ext v20.16b, v20.16b, v27.16b, #8
914 uzp1 v27.4s, v18.4s, v21.4s
915 uzp1 v18.4s, v22.4s, v22.4s
916 add v21.4s, v24.4s, v5.4s
917 ext v18.16b, v18.16b, v22.16b, #8
918 eor v19.16b, v19.16b, v21.16b
919 tbl v7.16b, { v16.16b, v17.16b }, v3.16b
920 uzp2 v18.4s, v18.4s, v17.4s
921 zip2 v16.4s, v16.4s, v20.4s
922 ushr v17.4s, v19.4s, #7
923 shl v19.4s, v19.4s, #25
924 orr v17.16b, v19.16b, v17.16b
925 ext v5.16b, v5.16b, v5.16b, #8
926 add v19.4s, v25.4s, v17.4s
927 eor v5.16b, v19.16b, v5.16b
928 ext v21.16b, v21.16b, v21.16b, #4
929 tbl v5.16b, { v5.16b }, v0.16b
930 add v4.4s, v19.4s, v4.4s
931 add v19.4s, v21.4s, v5.4s
932 eor v17.16b, v17.16b, v19.16b
933 ushr v21.4s, v17.4s, #12
934 shl v17.4s, v17.4s, #20
935 orr v17.16b, v17.16b, v21.16b
936 add v4.4s, v4.4s, v17.4s
937 eor v5.16b, v5.16b, v4.16b
938 tbl v5.16b, { v5.16b }, v1.16b
939 add v4.4s, v4.4s, v6.4s
940 add v6.4s, v19.4s, v5.4s
941 eor v17.16b, v17.16b, v6.16b
942 ushr v19.4s, v17.4s, #7
943 shl v17.4s, v17.4s, #25
944 ext v4.16b, v4.16b, v4.16b, #4
945 orr v17.16b, v17.16b, v19.16b
946 ext v5.16b, v5.16b, v5.16b, #8
947 add v4.4s, v4.4s, v17.4s
948 eor v5.16b, v4.16b, v5.16b
949 ext v6.16b, v6.16b, v6.16b, #12
950 tbl v5.16b, { v5.16b }, v0.16b
951 add v6.4s, v6.4s, v5.4s
952 eor v17.16b, v17.16b, v6.16b
953 ushr v19.4s, v17.4s, #12
954 shl v17.4s, v17.4s, #20
955 add v4.4s, v4.4s, v7.4s
956 orr v17.16b, v17.16b, v19.16b
957 add v4.4s, v4.4s, v17.4s
958 eor v5.16b, v5.16b, v4.16b
959 tbl v5.16b, { v5.16b }, v1.16b
960 mov v29.16b, v20.16b
961 ext v4.16b, v4.16b, v4.16b, #12
962 add v6.4s, v6.4s, v5.4s
963 mov v29.s[1], v27.s[2]
964 add v4.4s, v4.4s, v27.4s
965 zip1 v26.2d, v20.2d, v7.2d
966 zip1 v7.4s, v16.4s, v27.4s
967 zip1 v16.4s, v27.4s, v16.4s
968 eor v17.16b, v17.16b, v6.16b
969 ext v7.16b, v16.16b, v7.16b, #8
970 ushr v16.4s, v17.4s, #7
971 shl v17.4s, v17.4s, #25
972 orr v16.16b, v17.16b, v16.16b
973 ext v5.16b, v5.16b, v5.16b, #8
974 add v4.4s, v4.4s, v16.4s
975 eor v5.16b, v4.16b, v5.16b
976 ext v6.16b, v6.16b, v6.16b, #4
977 tbl v5.16b, { v5.16b }, v0.16b
978 add v6.4s, v6.4s, v5.4s
979 eor v16.16b, v16.16b, v6.16b
980 ushr v17.4s, v16.4s, #12
981 shl v16.4s, v16.4s, #20
982 add v4.4s, v4.4s, v20.4s
983 orr v16.16b, v16.16b, v17.16b
984 add v4.4s, v4.4s, v16.4s
985 eor v5.16b, v5.16b, v4.16b
986 tbl v5.16b, { v5.16b }, v1.16b
987 add v6.4s, v6.4s, v5.4s
988 eor v16.16b, v16.16b, v6.16b
989 add v4.4s, v4.4s, v18.4s
990 ushr v17.4s, v16.4s, #7
991 shl v16.4s, v16.4s, #25
992 ext v23.16b, v22.16b, v22.16b, #12
993 ext v4.16b, v4.16b, v4.16b, #4
994 orr v16.16b, v16.16b, v17.16b
995 ext v28.16b, v22.16b, v23.16b, #12
996 ext v5.16b, v5.16b, v5.16b, #8
997 add v4.4s, v16.4s, v4.4s
998 tbl v3.16b, { v28.16b, v29.16b }, v3.16b
999 eor v5.16b, v4.16b, v5.16b
1000 ext v6.16b, v6.16b, v6.16b, #12
1001 add v3.4s, v4.4s, v3.4s
1002 tbl v4.16b, { v5.16b }, v0.16b
1003 add v5.4s, v6.4s, v4.4s
1004 eor v6.16b, v16.16b, v5.16b
1005 ushr v16.4s, v6.4s, #12
1006 shl v6.4s, v6.4s, #20
1007 orr v6.16b, v6.16b, v16.16b
1008 tbl v2.16b, { v26.16b, v27.16b }, v2.16b
1009 add v3.4s, v3.4s, v6.4s
1010 ext v19.16b, v2.16b, v2.16b, #12
1011 eor v4.16b, v4.16b, v3.16b
1012 uzp1 v2.4s, v2.4s, v19.4s
1013 ext v3.16b, v3.16b, v3.16b, #12
1014 tbl v4.16b, { v4.16b }, v1.16b
1015 add v2.4s, v3.4s, v2.4s
1016 add v3.4s, v5.4s, v4.4s
1017 eor v5.16b, v6.16b, v3.16b
1018 ushr v6.4s, v5.4s, #7
1019 shl v5.4s, v5.4s, #25
1020 orr v5.16b, v5.16b, v6.16b
1021 ext v4.16b, v4.16b, v4.16b, #8
1022 add v2.4s, v2.4s, v5.4s
1023 eor v4.16b, v2.16b, v4.16b
1024 ext v3.16b, v3.16b, v3.16b, #4
1025 tbl v0.16b, { v4.16b }, v0.16b
1026 add v3.4s, v3.4s, v0.4s
1027 eor v4.16b, v5.16b, v3.16b
1028 ushr v5.4s, v4.4s, #12
1029 shl v4.4s, v4.4s, #20
1030 add v2.4s, v2.4s, v7.4s
1031 orr v4.16b, v4.16b, v5.16b
1032 add v2.4s, v2.4s, v4.4s
1033 eor v0.16b, v0.16b, v2.16b
1034 tbl v0.16b, { v0.16b }, v1.16b
1035 add v1.4s, v3.4s, v0.4s
1036 eor v3.16b, v4.16b, v1.16b
1037 ushr v4.4s, v3.4s, #7
1038 shl v3.4s, v3.4s, #25
1039 ext v2.16b, v2.16b, v2.16b, #4
1040 ext v0.16b, v0.16b, v0.16b, #8
1041 ext v1.16b, v1.16b, v1.16b, #12
1042 orr v3.16b, v3.16b, v4.16b
/*
 * Output bytes 0-31: the four working vectors XORed pairwise
 * (v2 ^ v1 and v3 ^ v0).
 */
1043 eor v2.16b, v2.16b, v1.16b
1044 eor v3.16b, v3.16b, v0.16b
1045 stp q2, q3, [x5]
/*
 * Output bytes 32-63: v1 and v0 XORed with the 32 bytes re-read
 * from x0 (first half with v1, second half with v0).
 */
1046 ldr q2, [x0]
1047 eor v1.16b, v2.16b, v1.16b
1048 str q1, [x5, #32]
1049 ldr q1, [x0, #16]
1050 eor v0.16b, v1.16b, v0.16b
1051 str q0, [x5, #48]
1052 ret
1053 .Lfunc_end1:
1054 .size zfs_blake3_compress_xof_sse41, .Lfunc_end1-zfs_blake3_compress_xof_sse41
1055 .cfi_endproc
1056
/*
 * 16-byte constants referenced by zfs_blake3_hash_many_sse41.  The
 * section is declared mergeable ("aM") with an entry size of 16.
 */
1057 .section .rodata.cst16,"aM",@progbits,16
1058 .p2align 4
/*
 * Four 32-bit words 0, 1, 2, 3.  zfs_blake3_hash_many_sse41 ANDs
 * this vector with a mask broadcast from bit 0 of w5.
 * NOTE(review): presumably per-lane counter offsets that are only
 * applied when a counter-increment flag is set - confirm.
 */
1059 .LCPI2_0:
1060 .word 0
1061 .word 1
1062 .word 2
1063 .word 3
/*
 * TBL byte indices {2,3,0,1, ...}: on a little-endian target this
 * rotates every 32-bit lane right by 16 bits.
 */
1064 .LCPI2_1:
1065 .byte 2
1066 .byte 3
1067 .byte 0
1068 .byte 1
1069 .byte 6
1070 .byte 7
1071 .byte 4
1072 .byte 5
1073 .byte 10
1074 .byte 11
1075 .byte 8
1076 .byte 9
1077 .byte 14
1078 .byte 15
1079 .byte 12
1080 .byte 13
/*
 * TBL byte indices {1,2,3,0, ...}: on a little-endian target this
 * rotates every 32-bit lane right by 8 bits.
 */
1081 .LCPI2_2:
1082 .byte 1
1083 .byte 2
1084 .byte 3
1085 .byte 0
1086 .byte 5
1087 .byte 6
1088 .byte 7
1089 .byte 4
1090 .byte 9
1091 .byte 10
1092 .byte 11
1093 .byte 8
1094 .byte 13
1095 .byte 14
1096 .byte 15
1097 .byte 12
1098 .text
1099 .globl zfs_blake3_hash_many_sse41
1100 .p2align 2
1101 .type zfs_blake3_hash_many_sse41,@function
1102 zfs_blake3_hash_many_sse41:
1103 .cfi_startproc
1104 stp d15, d14, [sp, #-160]!
1105 stp d13, d12, [sp, #16]
1106 stp d11, d10, [sp, #32]
1107 stp d9, d8, [sp, #48]
1108 stp x29, x30, [sp, #64]
1109 stp x28, x27, [sp, #80]
1110 stp x26, x25, [sp, #96]
1111 stp x24, x23, [sp, #112]
1112 stp x22, x21, [sp, #128]
1113 stp x20, x19, [sp, #144]
1114 mov x29, sp
1115 sub sp, sp, #448
1116 .cfi_def_cfa w29, 160
1117 .cfi_offset w19, -8
1118 .cfi_offset w20, -16
1119 .cfi_offset w21, -24
1120 .cfi_offset w22, -32
1121 .cfi_offset w23, -40
1122 .cfi_offset w24, -48
1123 .cfi_offset w25, -56
1124 .cfi_offset w26, -64
1125 .cfi_offset w27, -72
1126 .cfi_offset w28, -80
1127 .cfi_offset w30, -88
1128 .cfi_offset w29, -96
1129 .cfi_offset b8, -104
1130 .cfi_offset b9, -112
1131 .cfi_offset b10, -120
1132 .cfi_offset b11, -128
1133 .cfi_offset b12, -136
1134 .cfi_offset b13, -144
1135 .cfi_offset b14, -152
1136 .cfi_offset b15, -160
1137 ldr x26, [x29, #168]
1138 ldrb w27, [x29, #160]
1139 mov w19, w6
1140 mov x20, x4
1141 mov x22, x2
1142 mov x28, x1
1143 cmp x1, #4
1144 mov x24, x0
1145 str x3, [sp, #40]
1146 b.lo .LBB2_8
1147 adrp x11, .LCPI2_0
1148 ldr q0, [x11, :lo12:.LCPI2_0]
1149 sbfx w13, w5, #0, #1
1150 dup v1.4s, w13
1151 mov w10, #58983
1152 mov w11, #44677
1153 mov w12, #62322
1154 and v0.16b, v1.16b, v0.16b
1155 mov w13, #62778
1156 orr w8, w7, w19
1157 adrp x9, .LCPI2_1
1158 movk w10, #27145, lsl #16
1159 movk w11, #47975, lsl #16
1160 movk w12, #15470, lsl #16
1161 movk w13, #42319, lsl #16
1162 str q0, [sp, #16]
1163 orr v0.4s, #128, lsl #24
1164 adrp x14, .LCPI2_2
1165 str q0, [sp]
1166 .LBB2_2:
1167 ldr x2, [sp, #40]
1168 mov x15, x2
1169 ld1r { v7.4s }, [x15], #4
1170 add x16, x2, #8
1171 add x17, x2, #12
1172 add x18, x2, #16
1173 add x0, x2, #20
1174 add x3, x2, #24
1175 add x2, x2, #28
1176 ld1r { v6.4s }, [x16]
1177 ld1r { v17.4s }, [x17]
1178 ld1r { v10.4s }, [x18]
1179 ld1r { v11.4s }, [x0]
1180 ld1r { v19.4s }, [x3]
1181 ld1r { v18.4s }, [x15]
1182 ld1r { v16.4s }, [x2]
1183 cbz x22, .LBB2_7
1184 ldr q1, [sp, #16]
1185 dup v0.4s, w20
1186 ldp x15, x16, [x24]
1187 ldp x17, x18, [x24, #16]
1188 add v1.4s, v0.4s, v1.4s
1189 movi v0.4s, #128, lsl #24
1190 str q1, [sp, #64]
1191 eor v0.16b, v1.16b, v0.16b
1192 ldr q1, [sp]
1193 lsr x2, x20, #32
1194 mov x0, xzr
1195 mov w6, w8
1196 cmgt v0.4s, v1.4s, v0.4s
1197 dup v1.4s, w2
1198 sub v0.4s, v1.4s, v0.4s
1199 str q0, [sp, #48]
1200 .LBB2_4:
1201 mov w4, #16
1202 stp q16, q17, [sp, #192]
1203 bfi x4, x0, #6, #58
1204 ldr q1, [x15, x4]
1205 ldr q3, [x16, x4]
1206 ldr q2, [x17, x4]
1207 ldr q4, [x18, x4]
1208 mov w4, #32
1209 bfi x4, x0, #6, #58
1210 ldr q5, [x15, x4]
1211 ldr q20, [x16, x4]
1212 ldr q21, [x17, x4]
1213 ldr q22, [x18, x4]
1214 mov w4, #48
1215 lsl x3, x0, #6
1216 bfi x4, x0, #6, #58
1217 add x0, x0, #1
1218 ldr q0, [x15, x3]
1219 ldr q23, [x16, x3]
1220 ldr q16, [x17, x3]
1221 ldr q17, [x18, x3]
1222 cmp x0, x22
1223 ldr q25, [x15, x4]
1224 ldr q14, [x16, x4]
1225 ldr q28, [x17, x4]
1226 ldr q31, [x18, x4]
1227 csel w4, w27, wzr, eq
1228 orr w4, w4, w6
1229 mov x2, xzr
1230 and w6, w4, #0xff
1231 add x3, x3, #256
1232 .LBB2_5:
1233 ldr x4, [x24, x2]
1234 add x2, x2, #8
1235 cmp x2, #32
1236 add x4, x4, x3
1237 prfm pldl1keep, [x4]
1238 b.ne .LBB2_5
1239 zip1 v29.4s, v0.4s, v23.4s
1240 zip2 v23.4s, v0.4s, v23.4s
1241 zip1 v0.4s, v16.4s, v17.4s
1242 zip2 v24.4s, v16.4s, v17.4s
1243 zip1 v9.4s, v1.4s, v3.4s
1244 zip2 v26.4s, v1.4s, v3.4s
1245 zip1 v27.4s, v2.4s, v4.4s
1246 zip2 v17.4s, v2.4s, v4.4s
1247 zip1 v12.4s, v21.4s, v22.4s
1248 zip2 v13.4s, v21.4s, v22.4s
1249 add v2.4s, v7.4s, v10.4s
1250 add v1.4s, v18.4s, v11.4s
1251 ext v7.16b, v0.16b, v29.16b, #8
1252 ext v22.16b, v24.16b, v23.16b, #8
1253 zip1 v30.4s, v5.4s, v20.4s
1254 zip2 v20.4s, v5.4s, v20.4s
1255 stp q1, q2, [sp, #112]
1256 ext v2.16b, v29.16b, v7.16b, #8
1257 mov v29.d[1], v0.d[0]
1258 ext v18.16b, v23.16b, v22.16b, #8
1259 mov v23.d[1], v24.d[0]
1260 zip1 v21.4s, v25.4s, v14.4s
1261 zip2 v4.4s, v25.4s, v14.4s
1262 zip1 v14.4s, v28.4s, v31.4s
1263 zip2 v15.4s, v28.4s, v31.4s
1264 add v8.4s, v6.4s, v19.4s
1265 ext v28.16b, v27.16b, v9.16b, #8
1266 ext v31.16b, v17.16b, v26.16b, #8
1267 stur q2, [x29, #-208]
1268 mov v7.16b, v29.16b
1269 ext v0.16b, v12.16b, v30.16b, #8
1270 stp q23, q29, [x29, #-80]
1271 mov v2.16b, v19.16b
1272 ext v19.16b, v13.16b, v20.16b, #8
1273 mov v29.16b, v9.16b
1274 ext v25.16b, v9.16b, v28.16b, #8
1275 mov v29.d[1], v27.d[0]
1276 ext v24.16b, v26.16b, v31.16b, #8
1277 mov v26.d[1], v17.d[0]
1278 ext v17.16b, v15.16b, v4.16b, #8
1279 ext v27.16b, v30.16b, v0.16b, #8
1280 ext v0.16b, v20.16b, v19.16b, #8
1281 stp q0, q25, [sp, #80]
1282 ext v0.16b, v4.16b, v17.16b, #8
1283 str q0, [sp, #224]
1284 ldr q0, [sp, #128]
1285 mov v6.16b, v23.16b
1286 mov v22.16b, v4.16b
1287 ldr q16, [x9, :lo12:.LCPI2_1]
1288 add v17.4s, v0.4s, v7.4s
1289 ldr q0, [sp, #112]
1290 mov v30.d[1], v12.d[0]
1291 add v7.4s, v8.4s, v29.4s
1292 mov v20.d[1], v13.d[0]
1293 add v4.4s, v0.4s, v6.4s
1294 ldr q0, [sp, #64]
1295 dup v3.4s, w12
1296 ext v28.16b, v14.16b, v21.16b, #8
1297 dup v1.4s, w10
1298 eor v19.16b, v17.16b, v0.16b
1299 ldr q0, [sp, #48]
1300 ext v23.16b, v21.16b, v28.16b, #8
1301 mov v21.d[1], v14.d[0]
1302 tbl v14.16b, { v19.16b }, v16.16b
1303 eor v12.16b, v4.16b, v0.16b
1304 movi v0.4s, #64
1305 eor v13.16b, v7.16b, v0.16b
1306 tbl v13.16b, { v13.16b }, v16.16b
1307 add v6.4s, v13.4s, v3.4s
1308 dup v5.4s, w11
1309 tbl v12.16b, { v12.16b }, v16.16b
1310 add v1.4s, v14.4s, v1.4s
1311 eor v9.16b, v6.16b, v2.16b
1312 ldp q2, q0, [sp, #192]
1313 add v5.4s, v12.4s, v5.4s
1314 eor v19.16b, v1.16b, v10.16b
1315 eor v10.16b, v5.16b, v11.16b
1316 ushr v11.4s, v19.4s, #12
1317 shl v19.4s, v19.4s, #20
1318 orr v11.16b, v19.16b, v11.16b
1319 ushr v19.4s, v10.4s, #12
1320 shl v10.4s, v10.4s, #20
1321 mov v22.d[1], v15.d[0]
1322 orr v10.16b, v10.16b, v19.16b
1323 ushr v19.4s, v9.4s, #12
1324 shl v9.4s, v9.4s, #20
1325 add v15.4s, v0.4s, v2.4s
1326 orr v9.16b, v9.16b, v19.16b
1327 dup v19.4s, w6
1328 add v15.4s, v15.4s, v26.4s
1329 eor v19.16b, v15.16b, v19.16b
1330 tbl v3.16b, { v19.16b }, v16.16b
1331 dup v19.4s, w13
1332 add v8.4s, v3.4s, v19.4s
1333 ldur q31, [x29, #-208]
1334 eor v19.16b, v8.16b, v2.16b
1335 ushr v0.4s, v19.4s, #12
1336 shl v19.4s, v19.4s, #20
1337 orr v2.16b, v19.16b, v0.16b
1338 ldr q19, [x14, :lo12:.LCPI2_2]
1339 add v17.4s, v17.4s, v31.4s
1340 add v17.4s, v17.4s, v11.4s
1341 eor v14.16b, v14.16b, v17.16b
1342 tbl v14.16b, { v14.16b }, v19.16b
1343 add v1.4s, v1.4s, v14.4s
1344 eor v11.16b, v1.16b, v11.16b
1345 add v4.4s, v4.4s, v18.4s
1346 ushr v0.4s, v11.4s, #7
1347 shl v11.4s, v11.4s, #25
1348 add v4.4s, v4.4s, v10.4s
1349 orr v0.16b, v11.16b, v0.16b
1350 eor v11.16b, v12.16b, v4.16b
1351 tbl v11.16b, { v11.16b }, v19.16b
1352 add v5.4s, v5.4s, v11.4s
1353 eor v10.16b, v5.16b, v10.16b
1354 add v7.4s, v7.4s, v25.4s
1355 ushr v12.4s, v10.4s, #7
1356 shl v10.4s, v10.4s, #25
1357 add v7.4s, v7.4s, v9.4s
1358 orr v10.16b, v10.16b, v12.16b
1359 eor v12.16b, v13.16b, v7.16b
1360 tbl v12.16b, { v12.16b }, v19.16b
1361 add v6.4s, v6.4s, v12.4s
1362 eor v9.16b, v6.16b, v9.16b
1363 ushr v13.4s, v9.4s, #7
1364 shl v9.4s, v9.4s, #25
1365 orr v9.16b, v9.16b, v13.16b
1366 add v13.4s, v15.4s, v24.4s
1367 add v13.4s, v13.4s, v2.4s
1368 eor v3.16b, v3.16b, v13.16b
1369 tbl v3.16b, { v3.16b }, v19.16b
1370 add v8.4s, v8.4s, v3.4s
1371 eor v2.16b, v8.16b, v2.16b
1372 add v17.4s, v17.4s, v30.4s
1373 ushr v15.4s, v2.4s, #7
1374 shl v2.4s, v2.4s, #25
1375 add v17.4s, v17.4s, v10.4s
1376 add v4.4s, v4.4s, v20.4s
1377 orr v2.16b, v2.16b, v15.16b
1378 eor v3.16b, v3.16b, v17.16b
1379 add v4.4s, v4.4s, v9.4s
1380 add v7.4s, v7.4s, v21.4s
1381 tbl v3.16b, { v3.16b }, v16.16b
1382 eor v14.16b, v14.16b, v4.16b
1383 add v7.4s, v7.4s, v2.4s
1384 add v13.4s, v13.4s, v22.4s
1385 mov v28.16b, v26.16b
1386 stur q26, [x29, #-112]
1387 mov v26.16b, v18.16b
1388 mov v18.16b, v24.16b
1389 stur q24, [x29, #-160]
1390 add v6.4s, v6.4s, v3.4s
1391 mov v24.16b, v20.16b
1392 tbl v14.16b, { v14.16b }, v16.16b
1393 eor v11.16b, v11.16b, v7.16b
1394 add v13.4s, v13.4s, v0.4s
1395 ldr q20, [sp, #80]
1396 eor v10.16b, v6.16b, v10.16b
1397 add v8.4s, v8.4s, v14.4s
1398 tbl v11.16b, { v11.16b }, v16.16b
1399 eor v12.16b, v12.16b, v13.16b
1400 stp q30, q22, [x29, #-192]
1401 ushr v15.4s, v10.4s, #12
1402 shl v10.4s, v10.4s, #20
1403 eor v9.16b, v8.16b, v9.16b
1404 add v1.4s, v1.4s, v11.4s
1405 tbl v12.16b, { v12.16b }, v16.16b
1406 mov v30.16b, v27.16b
1407 add v17.4s, v17.4s, v27.4s
1408 ldr q27, [sp, #224]
1409 orr v10.16b, v10.16b, v15.16b
1410 ushr v15.4s, v9.4s, #12
1411 shl v9.4s, v9.4s, #20
1412 eor v2.16b, v1.16b, v2.16b
1413 add v5.4s, v5.4s, v12.4s
1414 orr v9.16b, v9.16b, v15.16b
1415 ushr v15.4s, v2.4s, #12
1416 shl v2.4s, v2.4s, #20
1417 eor v0.16b, v5.16b, v0.16b
1418 add v17.4s, v17.4s, v10.4s
1419 add v4.4s, v4.4s, v20.4s
1420 orr v2.16b, v2.16b, v15.16b
1421 ushr v15.4s, v0.4s, #12
1422 shl v0.4s, v0.4s, #20
1423 eor v3.16b, v3.16b, v17.16b
1424 add v4.4s, v4.4s, v9.4s
1425 add v7.4s, v7.4s, v23.4s
1426 orr v0.16b, v0.16b, v15.16b
1427 tbl v3.16b, { v3.16b }, v19.16b
1428 eor v14.16b, v14.16b, v4.16b
1429 add v7.4s, v7.4s, v2.4s
1430 add v13.4s, v13.4s, v27.4s
1431 add v6.4s, v6.4s, v3.4s
1432 tbl v14.16b, { v14.16b }, v19.16b
1433 eor v11.16b, v11.16b, v7.16b
1434 add v13.4s, v13.4s, v0.4s
1435 eor v10.16b, v6.16b, v10.16b
1436 add v8.4s, v8.4s, v14.4s
1437 tbl v11.16b, { v11.16b }, v19.16b
1438 eor v12.16b, v12.16b, v13.16b
1439 stur q21, [x29, #-144]
1440 ushr v15.4s, v10.4s, #7
1441 shl v10.4s, v10.4s, #25
1442 eor v9.16b, v8.16b, v9.16b
1443 add v1.4s, v1.4s, v11.4s
1444 tbl v12.16b, { v12.16b }, v19.16b
1445 ldur q21, [x29, #-80]
1446 orr v10.16b, v10.16b, v15.16b
1447 ushr v15.4s, v9.4s, #7
1448 shl v9.4s, v9.4s, #25
1449 eor v2.16b, v1.16b, v2.16b
1450 add v5.4s, v5.4s, v12.4s
1451 orr v9.16b, v9.16b, v15.16b
1452 ushr v15.4s, v2.4s, #7
1453 shl v2.4s, v2.4s, #25
1454 eor v0.16b, v5.16b, v0.16b
1455 orr v2.16b, v2.16b, v15.16b
1456 ushr v15.4s, v0.4s, #7
1457 shl v0.4s, v0.4s, #25
1458 orr v0.16b, v0.16b, v15.16b
1459 add v17.4s, v17.4s, v21.4s
1460 add v17.4s, v17.4s, v0.4s
1461 add v4.4s, v4.4s, v26.4s
1462 eor v14.16b, v14.16b, v17.16b
1463 add v4.4s, v4.4s, v10.4s
1464 add v7.4s, v7.4s, v18.4s
1465 tbl v14.16b, { v14.16b }, v16.16b
1466 eor v11.16b, v11.16b, v4.16b
1467 add v7.4s, v7.4s, v9.4s
1468 add v13.4s, v13.4s, v29.4s
1469 add v1.4s, v1.4s, v14.4s
1470 tbl v11.16b, { v11.16b }, v16.16b
1471 eor v12.16b, v12.16b, v7.16b
1472 add v13.4s, v13.4s, v2.4s
1473 eor v0.16b, v0.16b, v1.16b
1474 add v5.4s, v5.4s, v11.4s
1475 tbl v12.16b, { v12.16b }, v16.16b
1476 eor v3.16b, v3.16b, v13.16b
1477 ldur q22, [x29, #-64]
1478 ushr v15.4s, v0.4s, #12
1479 shl v0.4s, v0.4s, #20
1480 eor v10.16b, v5.16b, v10.16b
1481 add v6.4s, v6.4s, v12.4s
1482 tbl v3.16b, { v3.16b }, v16.16b
1483 orr v0.16b, v0.16b, v15.16b
1484 ushr v15.4s, v10.4s, #12
1485 shl v10.4s, v10.4s, #20
1486 eor v9.16b, v6.16b, v9.16b
1487 add v8.4s, v8.4s, v3.4s
1488 add v17.4s, v17.4s, v28.4s
1489 orr v10.16b, v10.16b, v15.16b
1490 ushr v15.4s, v9.4s, #12
1491 shl v9.4s, v9.4s, #20
1492 eor v2.16b, v8.16b, v2.16b
1493 add v17.4s, v17.4s, v0.4s
1494 add v4.4s, v4.4s, v24.4s
1495 orr v9.16b, v9.16b, v15.16b
1496 ushr v15.4s, v2.4s, #12
1497 shl v2.4s, v2.4s, #20
1498 eor v14.16b, v14.16b, v17.16b
1499 add v4.4s, v4.4s, v10.4s
1500 add v7.4s, v7.4s, v22.4s
1501 orr v2.16b, v2.16b, v15.16b
1502 tbl v14.16b, { v14.16b }, v19.16b
1503 eor v11.16b, v11.16b, v4.16b
1504 add v7.4s, v7.4s, v9.4s
1505 add v13.4s, v13.4s, v23.4s
1506 add v1.4s, v1.4s, v14.4s
1507 tbl v11.16b, { v11.16b }, v19.16b
1508 eor v12.16b, v12.16b, v7.16b
1509 add v13.4s, v13.4s, v2.4s
1510 eor v0.16b, v0.16b, v1.16b
1511 add v5.4s, v5.4s, v11.4s
1512 tbl v12.16b, { v12.16b }, v19.16b
1513 eor v3.16b, v3.16b, v13.16b
1514 ldur q22, [x29, #-144]
1515 ushr v15.4s, v0.4s, #7
1516 shl v0.4s, v0.4s, #25
1517 eor v10.16b, v5.16b, v10.16b
1518 add v6.4s, v6.4s, v12.4s
1519 tbl v3.16b, { v3.16b }, v19.16b
1520 orr v0.16b, v0.16b, v15.16b
1521 ushr v15.4s, v10.4s, #7
1522 shl v10.4s, v10.4s, #25
1523 eor v9.16b, v6.16b, v9.16b
1524 add v8.4s, v8.4s, v3.4s
1525 orr v10.16b, v10.16b, v15.16b
1526 ushr v15.4s, v9.4s, #7
1527 shl v9.4s, v9.4s, #25
1528 eor v2.16b, v8.16b, v2.16b
1529 add v17.4s, v17.4s, v31.4s
1530 orr v9.16b, v9.16b, v15.16b
1531 ushr v15.4s, v2.4s, #7
1532 shl v2.4s, v2.4s, #25
1533 add v17.4s, v17.4s, v10.4s
1534 add v4.4s, v4.4s, v22.4s
1535 orr v2.16b, v2.16b, v15.16b
1536 eor v3.16b, v3.16b, v17.16b
1537 add v4.4s, v4.4s, v9.4s
1538 add v7.4s, v7.4s, v30.4s
1539 tbl v3.16b, { v3.16b }, v16.16b
1540 eor v14.16b, v14.16b, v4.16b
1541 add v7.4s, v7.4s, v2.4s
1542 add v13.4s, v13.4s, v27.4s
1543 add v6.4s, v6.4s, v3.4s
1544 tbl v14.16b, { v14.16b }, v16.16b
1545 eor v11.16b, v11.16b, v7.16b
1546 add v13.4s, v13.4s, v0.4s
1547 ldr q27, [sp, #96]
1548 mov v21.16b, v26.16b
1549 stur q26, [x29, #-96]
1550 mov v28.16b, v31.16b
1551 eor v10.16b, v6.16b, v10.16b
1552 add v8.4s, v8.4s, v14.4s
1553 tbl v11.16b, { v11.16b }, v16.16b
1554 eor v12.16b, v12.16b, v13.16b
1555 ldp q31, q26, [x29, #-192]
1556 ushr v15.4s, v10.4s, #12
1557 shl v10.4s, v10.4s, #20
1558 eor v9.16b, v8.16b, v9.16b
1559 add v1.4s, v1.4s, v11.4s
1560 tbl v12.16b, { v12.16b }, v16.16b
1561 orr v10.16b, v10.16b, v15.16b
1562 ushr v15.4s, v9.4s, #12
1563 shl v9.4s, v9.4s, #20
1564 eor v2.16b, v1.16b, v2.16b
1565 add v5.4s, v5.4s, v12.4s
1566 add v17.4s, v17.4s, v20.4s
1567 orr v9.16b, v9.16b, v15.16b
1568 ushr v15.4s, v2.4s, #12
1569 shl v2.4s, v2.4s, #20
1570 eor v0.16b, v5.16b, v0.16b
1571 add v17.4s, v17.4s, v10.4s
1572 add v4.4s, v4.4s, v27.4s
1573 orr v2.16b, v2.16b, v15.16b
1574 ushr v15.4s, v0.4s, #12
1575 shl v0.4s, v0.4s, #20
1576 eor v3.16b, v3.16b, v17.16b
1577 add v4.4s, v4.4s, v9.4s
1578 add v7.4s, v7.4s, v26.4s
1579 orr v0.16b, v0.16b, v15.16b
1580 tbl v3.16b, { v3.16b }, v19.16b
1581 eor v14.16b, v14.16b, v4.16b
1582 add v7.4s, v7.4s, v2.4s
1583 add v13.4s, v13.4s, v31.4s
1584 add v6.4s, v6.4s, v3.4s
1585 tbl v14.16b, { v14.16b }, v19.16b
1586 eor v11.16b, v11.16b, v7.16b
1587 add v13.4s, v13.4s, v0.4s
1588 eor v10.16b, v6.16b, v10.16b
1589 add v8.4s, v8.4s, v14.4s
1590 tbl v11.16b, { v11.16b }, v19.16b
1591 eor v12.16b, v12.16b, v13.16b
1592 ushr v15.4s, v10.4s, #7
1593 shl v10.4s, v10.4s, #25
1594 eor v9.16b, v8.16b, v9.16b
1595 add v1.4s, v1.4s, v11.4s
1596 tbl v12.16b, { v12.16b }, v19.16b
1597 orr v10.16b, v10.16b, v15.16b
1598 ushr v15.4s, v9.4s, #7
1599 shl v9.4s, v9.4s, #25
1600 eor v2.16b, v1.16b, v2.16b
1601 add v5.4s, v5.4s, v12.4s
1602 orr v9.16b, v9.16b, v15.16b
1603 ushr v15.4s, v2.4s, #7
1604 shl v2.4s, v2.4s, #25
1605 eor v0.16b, v5.16b, v0.16b
1606 mov v18.16b, v24.16b
1607 mov v24.16b, v20.16b
1608 orr v2.16b, v2.16b, v15.16b
1609 ushr v15.4s, v0.4s, #7
1610 shl v0.4s, v0.4s, #25
1611 ldur q20, [x29, #-160]
1612 orr v0.16b, v0.16b, v15.16b
1613 add v17.4s, v17.4s, v21.4s
1614 add v17.4s, v17.4s, v0.4s
1615 add v4.4s, v4.4s, v18.4s
1616 eor v14.16b, v14.16b, v17.16b
1617 add v4.4s, v4.4s, v10.4s
1618 add v7.4s, v7.4s, v23.4s
1619 tbl v14.16b, { v14.16b }, v16.16b
1620 eor v11.16b, v11.16b, v4.16b
1621 add v7.4s, v7.4s, v9.4s
1622 add v13.4s, v13.4s, v20.4s
1623 add v1.4s, v1.4s, v14.4s
1624 tbl v11.16b, { v11.16b }, v16.16b
1625 eor v12.16b, v12.16b, v7.16b
1626 add v13.4s, v13.4s, v2.4s
1627 eor v0.16b, v0.16b, v1.16b
1628 add v5.4s, v5.4s, v11.4s
1629 tbl v12.16b, { v12.16b }, v16.16b
1630 eor v3.16b, v3.16b, v13.16b
1631 ldur q25, [x29, #-80]
1632 ushr v15.4s, v0.4s, #12
1633 shl v0.4s, v0.4s, #20
1634 eor v10.16b, v5.16b, v10.16b
1635 add v6.4s, v6.4s, v12.4s
1636 tbl v3.16b, { v3.16b }, v16.16b
1637 orr v0.16b, v0.16b, v15.16b
1638 ushr v15.4s, v10.4s, #12
1639 shl v10.4s, v10.4s, #20
1640 eor v9.16b, v6.16b, v9.16b
1641 add v8.4s, v8.4s, v3.4s
1642 add v17.4s, v17.4s, v29.4s
1643 orr v10.16b, v10.16b, v15.16b
1644 ushr v15.4s, v9.4s, #12
1645 shl v9.4s, v9.4s, #20
1646 eor v2.16b, v8.16b, v2.16b
1647 add v17.4s, v17.4s, v0.4s
1648 add v4.4s, v4.4s, v22.4s
1649 orr v9.16b, v9.16b, v15.16b
1650 ushr v15.4s, v2.4s, #12
1651 shl v2.4s, v2.4s, #20
1652 eor v14.16b, v14.16b, v17.16b
1653 add v4.4s, v4.4s, v10.4s
1654 add v7.4s, v7.4s, v25.4s
1655 orr v2.16b, v2.16b, v15.16b
1656 tbl v14.16b, { v14.16b }, v19.16b
1657 eor v11.16b, v11.16b, v4.16b
1658 add v7.4s, v7.4s, v9.4s
1659 add v13.4s, v13.4s, v26.4s
1660 add v1.4s, v1.4s, v14.4s
1661 tbl v11.16b, { v11.16b }, v19.16b
1662 eor v12.16b, v12.16b, v7.16b
1663 add v13.4s, v13.4s, v2.4s
1664 ldur q25, [x29, #-112]
1665 eor v0.16b, v0.16b, v1.16b
1666 add v5.4s, v5.4s, v11.4s
1667 tbl v12.16b, { v12.16b }, v19.16b
1668 eor v3.16b, v3.16b, v13.16b
1669 ushr v15.4s, v0.4s, #7
1670 shl v0.4s, v0.4s, #25
1671 eor v10.16b, v5.16b, v10.16b
1672 add v6.4s, v6.4s, v12.4s
1673 tbl v3.16b, { v3.16b }, v19.16b
1674 orr v0.16b, v0.16b, v15.16b
1675 ushr v15.4s, v10.4s, #7
1676 shl v10.4s, v10.4s, #25
1677 eor v9.16b, v6.16b, v9.16b
1678 add v8.4s, v8.4s, v3.4s
1679 orr v10.16b, v10.16b, v15.16b
1680 ushr v15.4s, v9.4s, #7
1681 shl v9.4s, v9.4s, #25
1682 eor v2.16b, v8.16b, v2.16b
1683 add v17.4s, v17.4s, v25.4s
1684 orr v9.16b, v9.16b, v15.16b
1685 ushr v15.4s, v2.4s, #7
1686 shl v2.4s, v2.4s, #25
1687 add v17.4s, v17.4s, v10.4s
1688 add v4.4s, v4.4s, v30.4s
1689 orr v2.16b, v2.16b, v15.16b
1690 eor v3.16b, v3.16b, v17.16b
1691 add v4.4s, v4.4s, v9.4s
1692 add v7.4s, v7.4s, v24.4s
1693 tbl v3.16b, { v3.16b }, v16.16b
1694 eor v14.16b, v14.16b, v4.16b
1695 add v7.4s, v7.4s, v2.4s
1696 add v13.4s, v13.4s, v31.4s
1697 add v6.4s, v6.4s, v3.4s
1698 tbl v14.16b, { v14.16b }, v16.16b
1699 eor v11.16b, v11.16b, v7.16b
1700 add v13.4s, v13.4s, v0.4s
1701 ldur q25, [x29, #-64]
1702 eor v10.16b, v6.16b, v10.16b
1703 add v8.4s, v8.4s, v14.4s
1704 tbl v11.16b, { v11.16b }, v16.16b
1705 eor v12.16b, v12.16b, v13.16b
1706 ldr q31, [sp, #224]
1707 ushr v15.4s, v10.4s, #12
1708 shl v10.4s, v10.4s, #20
1709 eor v9.16b, v8.16b, v9.16b
1710 add v1.4s, v1.4s, v11.4s
1711 tbl v12.16b, { v12.16b }, v16.16b
1712 orr v10.16b, v10.16b, v15.16b
1713 ushr v15.4s, v9.4s, #12
1714 shl v9.4s, v9.4s, #20
1715 eor v2.16b, v1.16b, v2.16b
1716 add v5.4s, v5.4s, v12.4s
1717 add v17.4s, v17.4s, v27.4s
1718 orr v9.16b, v9.16b, v15.16b
1719 ushr v15.4s, v2.4s, #12
1720 shl v2.4s, v2.4s, #20
1721 eor v0.16b, v5.16b, v0.16b
1722 add v17.4s, v17.4s, v10.4s
1723 add v4.4s, v4.4s, v25.4s
1724 orr v2.16b, v2.16b, v15.16b
1725 ushr v15.4s, v0.4s, #12
1726 shl v0.4s, v0.4s, #20
1727 eor v3.16b, v3.16b, v17.16b
1728 add v4.4s, v4.4s, v9.4s
1729 add v7.4s, v7.4s, v31.4s
1730 orr v0.16b, v0.16b, v15.16b
1731 tbl v3.16b, { v3.16b }, v19.16b
1732 eor v14.16b, v14.16b, v4.16b
1733 add v7.4s, v7.4s, v2.4s
1734 add v13.4s, v13.4s, v28.4s
1735 add v6.4s, v6.4s, v3.4s
1736 tbl v14.16b, { v14.16b }, v19.16b
1737 eor v11.16b, v11.16b, v7.16b
1738 add v13.4s, v13.4s, v0.4s
1739 eor v10.16b, v6.16b, v10.16b
1740 add v8.4s, v8.4s, v14.4s
1741 tbl v11.16b, { v11.16b }, v19.16b
1742 eor v12.16b, v12.16b, v13.16b
1743 ushr v15.4s, v10.4s, #7
1744 shl v10.4s, v10.4s, #25
1745 eor v9.16b, v8.16b, v9.16b
1746 add v1.4s, v1.4s, v11.4s
1747 tbl v12.16b, { v12.16b }, v19.16b
1748 orr v10.16b, v10.16b, v15.16b
1749 ushr v15.4s, v9.4s, #7
1750 shl v9.4s, v9.4s, #25
1751 eor v2.16b, v1.16b, v2.16b
1752 add v5.4s, v5.4s, v12.4s
1753 orr v9.16b, v9.16b, v15.16b
1754 ushr v15.4s, v2.4s, #7
1755 shl v2.4s, v2.4s, #25
1756 eor v0.16b, v5.16b, v0.16b
1757 orr v2.16b, v2.16b, v15.16b
1758 ushr v15.4s, v0.4s, #7
1759 shl v0.4s, v0.4s, #25
1760 orr v0.16b, v0.16b, v15.16b
1761 add v17.4s, v17.4s, v18.4s
1762 add v17.4s, v17.4s, v0.4s
1763 add v4.4s, v4.4s, v22.4s
1764 eor v14.16b, v14.16b, v17.16b
1765 add v4.4s, v4.4s, v10.4s
1766 add v7.4s, v7.4s, v26.4s
1767 tbl v14.16b, { v14.16b }, v16.16b
1768 eor v11.16b, v11.16b, v4.16b
1769 add v7.4s, v7.4s, v9.4s
1770 add v13.4s, v13.4s, v23.4s
1771 add v1.4s, v1.4s, v14.4s
1772 tbl v11.16b, { v11.16b }, v16.16b
1773 eor v12.16b, v12.16b, v7.16b
1774 add v13.4s, v13.4s, v2.4s
1775 mov v21.16b, v29.16b
1776 stur q29, [x29, #-128]
1777 mov v29.16b, v30.16b
1778 mov v30.16b, v27.16b
1779 mov v27.16b, v18.16b
1780 str q18, [sp, #176]
1781 eor v0.16b, v0.16b, v1.16b
1782 mov v18.16b, v22.16b
1783 add v5.4s, v5.4s, v11.4s
1784 tbl v12.16b, { v12.16b }, v16.16b
1785 eor v3.16b, v3.16b, v13.16b
1786 ldur q22, [x29, #-96]
1787 ushr v15.4s, v0.4s, #12
1788 shl v0.4s, v0.4s, #20
1789 eor v10.16b, v5.16b, v10.16b
1790 add v6.4s, v6.4s, v12.4s
1791 tbl v3.16b, { v3.16b }, v16.16b
1792 orr v0.16b, v0.16b, v15.16b
1793 ushr v15.4s, v10.4s, #12
1794 shl v10.4s, v10.4s, #20
1795 eor v9.16b, v6.16b, v9.16b
1796 add v8.4s, v8.4s, v3.4s
1797 add v17.4s, v17.4s, v20.4s
1798 orr v10.16b, v10.16b, v15.16b
1799 ushr v15.4s, v9.4s, #12
1800 shl v9.4s, v9.4s, #20
1801 eor v2.16b, v8.16b, v2.16b
1802 add v17.4s, v17.4s, v0.4s
1803 add v4.4s, v4.4s, v29.4s
1804 orr v9.16b, v9.16b, v15.16b
1805 ushr v15.4s, v2.4s, #12
1806 shl v2.4s, v2.4s, #20
1807 eor v14.16b, v14.16b, v17.16b
1808 add v4.4s, v4.4s, v10.4s
1809 add v7.4s, v7.4s, v22.4s
1810 orr v2.16b, v2.16b, v15.16b
1811 tbl v14.16b, { v14.16b }, v19.16b
1812 eor v11.16b, v11.16b, v4.16b
1813 add v7.4s, v7.4s, v9.4s
1814 add v13.4s, v13.4s, v31.4s
1815 add v1.4s, v1.4s, v14.4s
1816 tbl v11.16b, { v11.16b }, v19.16b
1817 eor v12.16b, v12.16b, v7.16b
1818 add v13.4s, v13.4s, v2.4s
1819 eor v0.16b, v0.16b, v1.16b
1820 add v5.4s, v5.4s, v11.4s
1821 tbl v12.16b, { v12.16b }, v19.16b
1822 eor v3.16b, v3.16b, v13.16b
1823 ushr v15.4s, v0.4s, #7
1824 shl v0.4s, v0.4s, #25
1825 eor v10.16b, v5.16b, v10.16b
1826 add v6.4s, v6.4s, v12.4s
1827 tbl v3.16b, { v3.16b }, v19.16b
1828 orr v0.16b, v0.16b, v15.16b
1829 ushr v15.4s, v10.4s, #7
1830 shl v10.4s, v10.4s, #25
1831 eor v9.16b, v6.16b, v9.16b
1832 add v8.4s, v8.4s, v3.4s
1833 orr v10.16b, v10.16b, v15.16b
1834 ushr v15.4s, v9.4s, #7
1835 shl v9.4s, v9.4s, #25
1836 eor v2.16b, v8.16b, v2.16b
1837 add v17.4s, v17.4s, v21.4s
1838 orr v9.16b, v9.16b, v15.16b
1839 ushr v15.4s, v2.4s, #7
1840 shl v2.4s, v2.4s, #25
1841 add v17.4s, v17.4s, v10.4s
1842 add v4.4s, v4.4s, v24.4s
1843 orr v2.16b, v2.16b, v15.16b
1844 eor v3.16b, v3.16b, v17.16b
1845 add v4.4s, v4.4s, v9.4s
1846 add v7.4s, v7.4s, v30.4s
1847 tbl v3.16b, { v3.16b }, v16.16b
1848 eor v14.16b, v14.16b, v4.16b
1849 add v7.4s, v7.4s, v2.4s
1850 add v13.4s, v13.4s, v28.4s
1851 add v6.4s, v6.4s, v3.4s
1852 mov v22.16b, v24.16b
1853 tbl v14.16b, { v14.16b }, v16.16b
1854 eor v11.16b, v11.16b, v7.16b
1855 add v13.4s, v13.4s, v0.4s
1856 ldur q24, [x29, #-80]
1857 eor v10.16b, v6.16b, v10.16b
1858 add v8.4s, v8.4s, v14.4s
1859 mov v21.16b, v30.16b
1860 tbl v11.16b, { v11.16b }, v16.16b
1861 eor v12.16b, v12.16b, v13.16b
1862 ldur q30, [x29, #-192]
1863 mov v20.16b, v29.16b
1864 ushr v15.4s, v10.4s, #12
1865 shl v10.4s, v10.4s, #20
1866 eor v9.16b, v8.16b, v9.16b
1867 add v1.4s, v1.4s, v11.4s
1868 tbl v12.16b, { v12.16b }, v16.16b
1869 ldur q29, [x29, #-112]
1870 orr v10.16b, v10.16b, v15.16b
1871 ushr v15.4s, v9.4s, #12
1872 shl v9.4s, v9.4s, #20
1873 eor v2.16b, v1.16b, v2.16b
1874 add v5.4s, v5.4s, v12.4s
1875 add v17.4s, v17.4s, v25.4s
1876 orr v9.16b, v9.16b, v15.16b
1877 ushr v15.4s, v2.4s, #12
1878 shl v2.4s, v2.4s, #20
1879 eor v0.16b, v5.16b, v0.16b
1880 add v17.4s, v17.4s, v10.4s
1881 add v4.4s, v4.4s, v24.4s
1882 orr v2.16b, v2.16b, v15.16b
1883 ushr v15.4s, v0.4s, #12
1884 shl v0.4s, v0.4s, #20
1885 eor v3.16b, v3.16b, v17.16b
1886 add v4.4s, v4.4s, v9.4s
1887 add v7.4s, v7.4s, v30.4s
1888 orr v0.16b, v0.16b, v15.16b
1889 tbl v3.16b, { v3.16b }, v19.16b
1890 eor v14.16b, v14.16b, v4.16b
1891 add v7.4s, v7.4s, v2.4s
1892 add v13.4s, v13.4s, v29.4s
1893 add v6.4s, v6.4s, v3.4s
1894 tbl v14.16b, { v14.16b }, v19.16b
1895 eor v11.16b, v11.16b, v7.16b
1896 add v13.4s, v13.4s, v0.4s
1897 eor v10.16b, v6.16b, v10.16b
1898 add v8.4s, v8.4s, v14.4s
1899 tbl v11.16b, { v11.16b }, v19.16b
1900 eor v12.16b, v12.16b, v13.16b
1901 ushr v15.4s, v10.4s, #7
1902 shl v10.4s, v10.4s, #25
1903 eor v9.16b, v8.16b, v9.16b
1904 add v1.4s, v1.4s, v11.4s
1905 tbl v12.16b, { v12.16b }, v19.16b
1906 orr v10.16b, v10.16b, v15.16b
1907 ushr v15.4s, v9.4s, #7
1908 shl v9.4s, v9.4s, #25
1909 eor v2.16b, v1.16b, v2.16b
1910 add v5.4s, v5.4s, v12.4s
1911 orr v9.16b, v9.16b, v15.16b
1912 ushr v15.4s, v2.4s, #7
1913 shl v2.4s, v2.4s, #25
1914 eor v0.16b, v5.16b, v0.16b
1915 orr v2.16b, v2.16b, v15.16b
1916 ushr v15.4s, v0.4s, #7
1917 shl v0.4s, v0.4s, #25
1918 orr v0.16b, v0.16b, v15.16b
1919 add v17.4s, v17.4s, v18.4s
1920 add v17.4s, v17.4s, v0.4s
1921 add v4.4s, v4.4s, v20.4s
1922 eor v14.16b, v14.16b, v17.16b
1923 add v4.4s, v4.4s, v10.4s
1924 add v7.4s, v7.4s, v31.4s
1925 tbl v14.16b, { v14.16b }, v16.16b
1926 eor v11.16b, v11.16b, v4.16b
1927 add v7.4s, v7.4s, v9.4s
1928 add v13.4s, v13.4s, v26.4s
1929 add v1.4s, v1.4s, v14.4s
1930 tbl v11.16b, { v11.16b }, v16.16b
1931 eor v12.16b, v12.16b, v7.16b
1932 add v13.4s, v13.4s, v2.4s
1933 eor v0.16b, v0.16b, v1.16b
1934 add v5.4s, v5.4s, v11.4s
1935 tbl v12.16b, { v12.16b }, v16.16b
1936 eor v3.16b, v3.16b, v13.16b
1937 ushr v15.4s, v0.4s, #12
1938 shl v0.4s, v0.4s, #20
1939 eor v10.16b, v5.16b, v10.16b
1940 add v6.4s, v6.4s, v12.4s
1941 tbl v3.16b, { v3.16b }, v16.16b
1942 orr v0.16b, v0.16b, v15.16b
1943 ushr v15.4s, v10.4s, #12
1944 shl v10.4s, v10.4s, #20
1945 eor v9.16b, v6.16b, v9.16b
1946 add v8.4s, v8.4s, v3.4s
1947 add v17.4s, v17.4s, v23.4s
1948 orr v10.16b, v10.16b, v15.16b
1949 ushr v15.4s, v9.4s, #12
1950 shl v9.4s, v9.4s, #20
1951 eor v2.16b, v8.16b, v2.16b
1952 add v17.4s, v17.4s, v0.4s
1953 add v4.4s, v4.4s, v22.4s
1954 orr v9.16b, v9.16b, v15.16b
1955 ushr v15.4s, v2.4s, #12
1956 shl v2.4s, v2.4s, #20
1957 eor v14.16b, v14.16b, v17.16b
1958 add v4.4s, v4.4s, v10.4s
1959 add v7.4s, v7.4s, v27.4s
1960 orr v2.16b, v2.16b, v15.16b
1961 tbl v14.16b, { v14.16b }, v19.16b
1962 eor v11.16b, v11.16b, v4.16b
1963 add v7.4s, v7.4s, v9.4s
1964 add v13.4s, v13.4s, v30.4s
1965 add v1.4s, v1.4s, v14.4s
1966 tbl v11.16b, { v11.16b }, v19.16b
1967 eor v12.16b, v12.16b, v7.16b
1968 add v13.4s, v13.4s, v2.4s
1969 ldur q27, [x29, #-160]
1970 eor v0.16b, v0.16b, v1.16b
1971 add v5.4s, v5.4s, v11.4s
1972 tbl v12.16b, { v12.16b }, v19.16b
1973 eor v3.16b, v3.16b, v13.16b
1974 ushr v15.4s, v0.4s, #7
1975 shl v0.4s, v0.4s, #25
1976 eor v10.16b, v5.16b, v10.16b
1977 add v6.4s, v6.4s, v12.4s
1978 tbl v3.16b, { v3.16b }, v19.16b
1979 orr v0.16b, v0.16b, v15.16b
1980 ushr v15.4s, v10.4s, #7
1981 shl v10.4s, v10.4s, #25
1982 eor v9.16b, v6.16b, v9.16b
1983 add v8.4s, v8.4s, v3.4s
1984 orr v10.16b, v10.16b, v15.16b
1985 ushr v15.4s, v9.4s, #7
1986 shl v9.4s, v9.4s, #25
1987 eor v2.16b, v8.16b, v2.16b
1988 add v17.4s, v17.4s, v27.4s
1989 mov v28.16b, v25.16b
1990 orr v9.16b, v9.16b, v15.16b
1991 ushr v15.4s, v2.4s, #7
1992 shl v2.4s, v2.4s, #25
1993 add v17.4s, v17.4s, v10.4s
1994 add v4.4s, v4.4s, v21.4s
1995 orr v2.16b, v2.16b, v15.16b
1996 eor v3.16b, v3.16b, v17.16b
1997 add v4.4s, v4.4s, v9.4s
1998 add v7.4s, v7.4s, v28.4s
1999 tbl v3.16b, { v3.16b }, v16.16b
2000 eor v14.16b, v14.16b, v4.16b
2001 add v7.4s, v7.4s, v2.4s
2002 add v13.4s, v13.4s, v29.4s
2003 mov v25.16b, v31.16b
2004 add v6.4s, v6.4s, v3.4s
2005 tbl v14.16b, { v14.16b }, v16.16b
2006 eor v11.16b, v11.16b, v7.16b
2007 add v13.4s, v13.4s, v0.4s
2008 ldur q31, [x29, #-96]
2009 eor v10.16b, v6.16b, v10.16b
2010 add v8.4s, v8.4s, v14.4s
2011 tbl v11.16b, { v11.16b }, v16.16b
2012 eor v12.16b, v12.16b, v13.16b
2013 ldur q28, [x29, #-208]
2014 mov v18.16b, v20.16b
2015 str q20, [sp, #144]
2016 ushr v15.4s, v10.4s, #12
2017 shl v10.4s, v10.4s, #20
2018 eor v9.16b, v8.16b, v9.16b
2019 add v1.4s, v1.4s, v11.4s
2020 tbl v12.16b, { v12.16b }, v16.16b
2021 ldur q20, [x29, #-128]
2022 orr v10.16b, v10.16b, v15.16b
2023 ushr v15.4s, v9.4s, #12
2024 shl v9.4s, v9.4s, #20
2025 eor v2.16b, v1.16b, v2.16b
2026 add v5.4s, v5.4s, v12.4s
2027 add v17.4s, v17.4s, v24.4s
2028 orr v9.16b, v9.16b, v15.16b
2029 ushr v15.4s, v2.4s, #12
2030 shl v2.4s, v2.4s, #20
2031 eor v0.16b, v5.16b, v0.16b
2032 add v17.4s, v17.4s, v10.4s
2033 add v4.4s, v4.4s, v31.4s
2034 orr v2.16b, v2.16b, v15.16b
2035 ushr v15.4s, v0.4s, #12
2036 shl v0.4s, v0.4s, #20
2037 eor v3.16b, v3.16b, v17.16b
2038 add v4.4s, v4.4s, v9.4s
2039 add v7.4s, v7.4s, v28.4s
2040 orr v0.16b, v0.16b, v15.16b
2041 tbl v3.16b, { v3.16b }, v19.16b
2042 eor v14.16b, v14.16b, v4.16b
2043 add v7.4s, v7.4s, v2.4s
2044 add v13.4s, v13.4s, v20.4s
2045 add v6.4s, v6.4s, v3.4s
2046 tbl v14.16b, { v14.16b }, v19.16b
2047 eor v11.16b, v11.16b, v7.16b
2048 add v13.4s, v13.4s, v0.4s
2049 eor v10.16b, v6.16b, v10.16b
2050 add v8.4s, v8.4s, v14.4s
2051 tbl v11.16b, { v11.16b }, v19.16b
2052 eor v12.16b, v12.16b, v13.16b
2053 ushr v15.4s, v10.4s, #7
2054 shl v10.4s, v10.4s, #25
2055 eor v9.16b, v8.16b, v9.16b
2056 add v1.4s, v1.4s, v11.4s
2057 tbl v12.16b, { v12.16b }, v19.16b
2058 orr v10.16b, v10.16b, v15.16b
2059 ushr v15.4s, v9.4s, #7
2060 shl v9.4s, v9.4s, #25
2061 eor v2.16b, v1.16b, v2.16b
2062 add v5.4s, v5.4s, v12.4s
2063 orr v9.16b, v9.16b, v15.16b
2064 ushr v15.4s, v2.4s, #7
2065 shl v2.4s, v2.4s, #25
2066 eor v0.16b, v5.16b, v0.16b
2067 orr v2.16b, v2.16b, v15.16b
2068 ushr v15.4s, v0.4s, #7
2069 shl v0.4s, v0.4s, #25
2070 orr v0.16b, v0.16b, v15.16b
2071 add v17.4s, v17.4s, v18.4s
2072 add v17.4s, v17.4s, v0.4s
2073 add v4.4s, v4.4s, v22.4s
2074 eor v14.16b, v14.16b, v17.16b
2075 add v4.4s, v4.4s, v10.4s
2076 add v7.4s, v7.4s, v30.4s
2077 tbl v14.16b, { v14.16b }, v16.16b
2078 eor v11.16b, v11.16b, v4.16b
2079 add v7.4s, v7.4s, v9.4s
2080 add v13.4s, v13.4s, v25.4s
2081 add v1.4s, v1.4s, v14.4s
2082 tbl v11.16b, { v11.16b }, v16.16b
2083 eor v12.16b, v12.16b, v7.16b
2084 add v13.4s, v13.4s, v2.4s
2085 eor v0.16b, v0.16b, v1.16b
2086 add v5.4s, v5.4s, v11.4s
2087 tbl v12.16b, { v12.16b }, v16.16b
2088 eor v3.16b, v3.16b, v13.16b
2089 add v17.4s, v17.4s, v26.4s
2090 mov v26.16b, v21.16b
2091 add v4.4s, v4.4s, v21.4s
2092 ldur q21, [x29, #-144]
2093 ushr v15.4s, v0.4s, #12
2094 shl v0.4s, v0.4s, #20
2095 eor v10.16b, v5.16b, v10.16b
2096 add v6.4s, v6.4s, v12.4s
2097 tbl v3.16b, { v3.16b }, v16.16b
2098 orr v0.16b, v0.16b, v15.16b
2099 ushr v15.4s, v10.4s, #12
2100 shl v10.4s, v10.4s, #20
2101 eor v9.16b, v6.16b, v9.16b
2102 add v8.4s, v8.4s, v3.4s
2103 orr v10.16b, v10.16b, v15.16b
2104 ushr v15.4s, v9.4s, #12
2105 shl v9.4s, v9.4s, #20
2106 eor v2.16b, v8.16b, v2.16b
2107 add v17.4s, v17.4s, v0.4s
2108 orr v9.16b, v9.16b, v15.16b
2109 ushr v15.4s, v2.4s, #12
2110 shl v2.4s, v2.4s, #20
2111 eor v14.16b, v14.16b, v17.16b
2112 add v4.4s, v4.4s, v10.4s
2113 add v7.4s, v7.4s, v21.4s
2114 orr v2.16b, v2.16b, v15.16b
2115 tbl v14.16b, { v14.16b }, v19.16b
2116 eor v11.16b, v11.16b, v4.16b
2117 add v7.4s, v7.4s, v9.4s
2118 add v13.4s, v13.4s, v28.4s
2119 add v1.4s, v1.4s, v14.4s
2120 tbl v11.16b, { v11.16b }, v19.16b
2121 eor v12.16b, v12.16b, v7.16b
2122 add v13.4s, v13.4s, v2.4s
2123 str q23, [sp, #160]
2124 eor v0.16b, v0.16b, v1.16b
2125 add v5.4s, v5.4s, v11.4s
2126 tbl v12.16b, { v12.16b }, v19.16b
2127 eor v3.16b, v3.16b, v13.16b
2128 add v17.4s, v17.4s, v23.4s
2129 ldur q23, [x29, #-64]
2130 ushr v15.4s, v0.4s, #7
2131 shl v0.4s, v0.4s, #25
2132 eor v10.16b, v5.16b, v10.16b
2133 add v6.4s, v6.4s, v12.4s
2134 tbl v3.16b, { v3.16b }, v19.16b
2135 orr v0.16b, v0.16b, v15.16b
2136 ushr v15.4s, v10.4s, #7
2137 shl v10.4s, v10.4s, #25
2138 eor v9.16b, v6.16b, v9.16b
2139 add v8.4s, v8.4s, v3.4s
2140 orr v10.16b, v10.16b, v15.16b
2141 ushr v15.4s, v9.4s, #7
2142 shl v9.4s, v9.4s, #25
2143 eor v2.16b, v8.16b, v2.16b
2144 orr v9.16b, v9.16b, v15.16b
2145 ushr v15.4s, v2.4s, #7
2146 shl v2.4s, v2.4s, #25
2147 add v17.4s, v17.4s, v10.4s
2148 add v4.4s, v4.4s, v23.4s
2149 orr v2.16b, v2.16b, v15.16b
2150 eor v3.16b, v3.16b, v17.16b
2151 add v4.4s, v4.4s, v9.4s
2152 add v7.4s, v7.4s, v24.4s
2153 tbl v3.16b, { v3.16b }, v16.16b
2154 eor v14.16b, v14.16b, v4.16b
2155 add v7.4s, v7.4s, v2.4s
2156 add v6.4s, v6.4s, v3.4s
2157 tbl v14.16b, { v14.16b }, v16.16b
2158 eor v11.16b, v11.16b, v7.16b
2159 add v13.4s, v13.4s, v20.4s
2160 eor v10.16b, v6.16b, v10.16b
2161 add v8.4s, v8.4s, v14.4s
2162 tbl v11.16b, { v11.16b }, v16.16b
2163 add v13.4s, v13.4s, v0.4s
2164 ldr q20, [sp, #176]
2165 ushr v15.4s, v10.4s, #12
2166 shl v10.4s, v10.4s, #20
2167 eor v9.16b, v8.16b, v9.16b
2168 add v1.4s, v1.4s, v11.4s
2169 eor v12.16b, v12.16b, v13.16b
2170 orr v10.16b, v10.16b, v15.16b
2171 ushr v15.4s, v9.4s, #12
2172 shl v9.4s, v9.4s, #20
2173 eor v2.16b, v1.16b, v2.16b
2174 tbl v12.16b, { v12.16b }, v16.16b
2175 orr v9.16b, v9.16b, v15.16b
2176 ushr v15.4s, v2.4s, #12
2177 shl v2.4s, v2.4s, #20
2178 add v5.4s, v5.4s, v12.4s
2179 add v17.4s, v17.4s, v31.4s
2180 orr v2.16b, v2.16b, v15.16b
2181 eor v0.16b, v5.16b, v0.16b
2182 add v17.4s, v17.4s, v10.4s
2183 add v4.4s, v4.4s, v20.4s
2184 add v7.4s, v7.4s, v29.4s
2185 ushr v15.4s, v0.4s, #12
2186 shl v0.4s, v0.4s, #20
2187 eor v3.16b, v3.16b, v17.16b
2188 add v4.4s, v4.4s, v9.4s
2189 add v7.4s, v7.4s, v2.4s
2190 orr v0.16b, v0.16b, v15.16b
2191 mov v15.16b, v31.16b
2192 add v17.4s, v17.4s, v22.4s
2193 eor v31.16b, v14.16b, v4.16b
2194 eor v22.16b, v11.16b, v7.16b
2195 add v11.4s, v13.4s, v27.4s
2196 tbl v3.16b, { v3.16b }, v19.16b
2197 add v11.4s, v11.4s, v0.4s
2198 tbl v31.16b, { v31.16b }, v19.16b
2199 add v6.4s, v6.4s, v3.4s
2200 eor v12.16b, v12.16b, v11.16b
2201 tbl v22.16b, { v22.16b }, v19.16b
2202 add v8.4s, v8.4s, v31.4s
2203 eor v10.16b, v6.16b, v10.16b
2204 add v30.4s, v11.4s, v30.4s
2205 tbl v11.16b, { v12.16b }, v19.16b
2206 add v1.4s, v1.4s, v22.4s
2207 eor v9.16b, v8.16b, v9.16b
2208 ushr v12.4s, v10.4s, #7
2209 shl v10.4s, v10.4s, #25
2210 add v5.4s, v5.4s, v11.4s
2211 eor v2.16b, v1.16b, v2.16b
2212 orr v10.16b, v10.16b, v12.16b
2213 ushr v12.4s, v9.4s, #7
2214 shl v9.4s, v9.4s, #25
2215 eor v0.16b, v5.16b, v0.16b
2216 orr v9.16b, v9.16b, v12.16b
2217 ushr v12.4s, v2.4s, #7
2218 shl v2.4s, v2.4s, #25
2219 orr v2.16b, v2.16b, v12.16b
2220 ushr v12.4s, v0.4s, #7
2221 shl v0.4s, v0.4s, #25
2222 orr v0.16b, v0.16b, v12.16b
2223 add v4.4s, v4.4s, v26.4s
2224 add v17.4s, v17.4s, v0.4s
2225 add v7.4s, v7.4s, v28.4s
2226 mov v18.16b, v27.16b
2227 eor v31.16b, v31.16b, v17.16b
2228 add v4.4s, v4.4s, v10.4s
2229 add v27.4s, v30.4s, v2.4s
2230 eor v22.16b, v22.16b, v4.16b
2231 add v7.4s, v7.4s, v9.4s
2232 eor v3.16b, v3.16b, v27.16b
2233 add v26.4s, v27.4s, v29.4s
2234 tbl v27.16b, { v31.16b }, v16.16b
2235 eor v28.16b, v11.16b, v7.16b
2236 tbl v22.16b, { v22.16b }, v16.16b
2237 add v1.4s, v1.4s, v27.4s
2238 add v4.4s, v4.4s, v23.4s
2239 ldr q23, [sp, #144]
2240 tbl v28.16b, { v28.16b }, v16.16b
2241 tbl v3.16b, { v3.16b }, v16.16b
2242 add v5.4s, v5.4s, v22.4s
2243 eor v0.16b, v0.16b, v1.16b
2244 add v6.4s, v6.4s, v28.4s
2245 add v29.4s, v8.4s, v3.4s
2246 eor v30.16b, v5.16b, v10.16b
2247 ushr v8.4s, v0.4s, #12
2248 shl v0.4s, v0.4s, #20
2249 eor v31.16b, v6.16b, v9.16b
2250 orr v0.16b, v0.16b, v8.16b
2251 ushr v8.4s, v30.4s, #12
2252 shl v30.4s, v30.4s, #20
2253 eor v2.16b, v29.16b, v2.16b
2254 orr v30.16b, v30.16b, v8.16b
2255 ushr v8.4s, v31.4s, #12
2256 shl v31.4s, v31.4s, #20
2257 add v17.4s, v17.4s, v25.4s
2258 add v7.4s, v7.4s, v23.4s
2259 orr v31.16b, v31.16b, v8.16b
2260 ushr v8.4s, v2.4s, #12
2261 shl v2.4s, v2.4s, #20
2262 ldur q23, [x29, #-176]
2263 orr v2.16b, v2.16b, v8.16b
2264 add v17.4s, v17.4s, v0.4s
2265 eor v27.16b, v27.16b, v17.16b
2266 add v4.4s, v4.4s, v30.4s
2267 add v25.4s, v26.4s, v2.4s
2268 eor v22.16b, v22.16b, v4.16b
2269 add v4.4s, v4.4s, v24.4s
2270 add v7.4s, v7.4s, v31.4s
2271 eor v3.16b, v3.16b, v25.16b
2272 add v24.4s, v25.4s, v18.4s
2273 tbl v25.16b, { v27.16b }, v19.16b
2274 add v17.4s, v17.4s, v23.4s
2275 eor v23.16b, v28.16b, v7.16b
2276 tbl v22.16b, { v22.16b }, v19.16b
2277 add v1.4s, v1.4s, v25.4s
2278 tbl v23.16b, { v23.16b }, v19.16b
2279 tbl v3.16b, { v3.16b }, v19.16b
2280 add v5.4s, v5.4s, v22.4s
2281 eor v0.16b, v0.16b, v1.16b
2282 add v6.4s, v6.4s, v23.4s
2283 add v26.4s, v29.4s, v3.4s
2284 eor v27.16b, v5.16b, v30.16b
2285 ushr v29.4s, v0.4s, #7
2286 shl v0.4s, v0.4s, #25
2287 eor v28.16b, v6.16b, v31.16b
2288 orr v0.16b, v0.16b, v29.16b
2289 ushr v29.4s, v27.4s, #7
2290 shl v27.4s, v27.4s, #25
2291 eor v2.16b, v26.16b, v2.16b
2292 orr v27.16b, v27.16b, v29.16b
2293 ushr v29.4s, v28.4s, #7
2294 shl v28.4s, v28.4s, #25
2295 ldur q18, [x29, #-128]
2296 orr v28.16b, v28.16b, v29.16b
2297 ushr v29.4s, v2.4s, #7
2298 shl v2.4s, v2.4s, #25
2299 add v7.4s, v7.4s, v15.4s
2300 orr v2.16b, v2.16b, v29.16b
2301 add v17.4s, v17.4s, v27.4s
2302 add v4.4s, v4.4s, v28.4s
2303 add v7.4s, v7.4s, v2.4s
2304 eor v3.16b, v3.16b, v17.16b
2305 add v17.4s, v17.4s, v20.4s
2306 eor v20.16b, v25.16b, v4.16b
2307 add v4.4s, v4.4s, v21.4s
2308 eor v21.16b, v22.16b, v7.16b
2309 add v7.4s, v7.4s, v18.4s
2310 add v18.4s, v24.4s, v0.4s
2311 eor v22.16b, v23.16b, v18.16b
2312 ldr q23, [sp, #160]
2313 tbl v3.16b, { v3.16b }, v16.16b
2314 tbl v20.16b, { v20.16b }, v16.16b
2315 add v6.4s, v6.4s, v3.4s
2316 add v18.4s, v18.4s, v23.4s
2317 tbl v21.16b, { v21.16b }, v16.16b
2318 tbl v16.16b, { v22.16b }, v16.16b
2319 add v22.4s, v26.4s, v20.4s
2320 eor v23.16b, v6.16b, v27.16b
2321 add v1.4s, v1.4s, v21.4s
2322 eor v24.16b, v22.16b, v28.16b
2323 ushr v25.4s, v23.4s, #12
2324 shl v23.4s, v23.4s, #20
2325 add v5.4s, v5.4s, v16.4s
2326 eor v2.16b, v1.16b, v2.16b
2327 orr v23.16b, v23.16b, v25.16b
2328 ushr v25.4s, v24.4s, #12
2329 shl v24.4s, v24.4s, #20
2330 eor v0.16b, v5.16b, v0.16b
2331 orr v24.16b, v24.16b, v25.16b
2332 ushr v25.4s, v2.4s, #12
2333 shl v2.4s, v2.4s, #20
2334 orr v2.16b, v2.16b, v25.16b
2335 ushr v25.4s, v0.4s, #12
2336 shl v0.4s, v0.4s, #20
2337 orr v0.16b, v0.16b, v25.16b
2338 add v25.4s, v7.4s, v2.4s
2339 add v26.4s, v18.4s, v0.4s
2340 eor v18.16b, v21.16b, v25.16b
2341 add v17.4s, v17.4s, v23.4s
2342 add v4.4s, v4.4s, v24.4s
2343 eor v16.16b, v16.16b, v26.16b
2344 tbl v21.16b, { v18.16b }, v19.16b
2345 eor v3.16b, v3.16b, v17.16b
2346 eor v7.16b, v20.16b, v4.16b
2347 tbl v16.16b, { v16.16b }, v19.16b
2348 add v1.4s, v1.4s, v21.4s
2349 tbl v3.16b, { v3.16b }, v19.16b
2350 tbl v20.16b, { v7.16b }, v19.16b
2351 eor v2.16b, v1.16b, v2.16b
2352 eor v7.16b, v1.16b, v17.16b
2353 add v1.4s, v5.4s, v16.4s
2354 eor v0.16b, v1.16b, v0.16b
2355 eor v18.16b, v1.16b, v4.16b
2356 add v1.4s, v6.4s, v3.4s
2357 eor v4.16b, v1.16b, v23.16b
2358 eor v6.16b, v25.16b, v1.16b
2359 add v1.4s, v22.4s, v20.4s
2360 eor v5.16b, v1.16b, v24.16b
2361 eor v17.16b, v26.16b, v1.16b
2362 ushr v1.4s, v4.4s, #7
2363 shl v4.4s, v4.4s, #25
2364 orr v1.16b, v4.16b, v1.16b
2365 ushr v4.4s, v5.4s, #7
2366 shl v5.4s, v5.4s, #25
2367 orr v4.16b, v5.16b, v4.16b
2368 ushr v5.4s, v2.4s, #7
2369 shl v2.4s, v2.4s, #25
2370 orr v2.16b, v2.16b, v5.16b
2371 ushr v5.4s, v0.4s, #7
2372 shl v0.4s, v0.4s, #25
2373 orr v0.16b, v0.16b, v5.16b
2374 eor v10.16b, v0.16b, v20.16b
2375 eor v11.16b, v1.16b, v21.16b
2376 eor v19.16b, v4.16b, v16.16b
2377 cmp x0, x22
2378 eor v16.16b, v2.16b, v3.16b
2379 mov w6, w19
2380 b.ne .LBB2_4
2381 .LBB2_7:
2382 zip1 v0.4s, v7.4s, v18.4s
2383 zip2 v1.4s, v7.4s, v18.4s
2384 zip1 v2.4s, v6.4s, v17.4s
2385 zip2 v3.4s, v6.4s, v17.4s
2386 zip1 v4.4s, v10.4s, v11.4s
2387 zip2 v5.4s, v10.4s, v11.4s
2388 zip1 v6.4s, v19.4s, v16.4s
2389 zip2 v7.4s, v19.4s, v16.4s
2390 add x15, x20, #4
2391 tst w5, #0x1
2392 sub x28, x28, #4
2393 zip1 v16.2d, v0.2d, v2.2d
2394 zip2 v0.2d, v0.2d, v2.2d
2395 zip1 v2.2d, v1.2d, v3.2d
2396 zip2 v1.2d, v1.2d, v3.2d
2397 zip1 v3.2d, v4.2d, v6.2d
2398 zip2 v4.2d, v4.2d, v6.2d
2399 zip1 v6.2d, v5.2d, v7.2d
2400 zip2 v5.2d, v5.2d, v7.2d
2401 add x24, x24, #32
2402 csel x20, x15, x20, ne
2403 cmp x28, #3
2404 stp q16, q3, [x26]
2405 stp q0, q4, [x26, #32]
2406 stp q2, q6, [x26, #64]
2407 stp q1, q5, [x26, #96]
2408 add x26, x26, #128
2409 b.hi .LBB2_2
2410 .LBB2_8:
2411 cbz x28, .LBB2_16
2412 orr w8, w7, w19
2413 and x21, x5, #0x1
2414 stur w8, [x29, #-64]
2415 .LBB2_10:
2416 ldr x8, [sp, #40]
2417 ldr x25, [x24]
2418 ldur w4, [x29, #-64]
2419 ldp q1, q0, [x8]
2420 mov x8, x22
2421 stp q1, q0, [x29, #-48]
2422 .LBB2_11:
2423 subs x23, x8, #1
2424 b.eq .LBB2_13
2425 cbnz x8, .LBB2_14
2426 b .LBB2_15
2427 .LBB2_13:
2428 orr w4, w4, w27
2429 .LBB2_14:
2430 sub x0, x29, #48
2431 mov w2, #64
2432 mov x1, x25
2433 mov x3, x20
2434 bl zfs_blake3_compress_in_place_sse41
2435 add x25, x25, #64
2436 mov x8, x23
2437 mov w4, w19
2438 b .LBB2_11
2439 .LBB2_15:
2440 ldp q0, q1, [x29, #-48]
2441 add x20, x20, x21
2442 add x24, x24, #8
2443 subs x28, x28, #1
2444 stp q0, q1, [x26], #32
2445 b.ne .LBB2_10
2446 .LBB2_16:
2447 add sp, sp, #448
2448 ldp x20, x19, [sp, #144]
2449 ldp x22, x21, [sp, #128]
2450 ldp x24, x23, [sp, #112]
2451 ldp x26, x25, [sp, #96]
2452 ldp x28, x27, [sp, #80]
2453 ldp x29, x30, [sp, #64]
2454 ldp d9, d8, [sp, #48]
2455 ldp d11, d10, [sp, #32]
2456 ldp d13, d12, [sp, #16]
2457 ldp d15, d14, [sp], #160
2458 ret
2459 .Lfunc_end2:
2460 .size zfs_blake3_hash_many_sse41, .Lfunc_end2-zfs_blake3_hash_many_sse41
2461 .cfi_endproc
2462 .section ".note.GNU-stack","",@progbits
2463 #endif