/*
* Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
* Copyright (c) 2019-2022 Samuel Neves and Matthew Krupcale
- * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ * Copyright (c) 2022-2023 Tino Reichardt <milky-zfs@mcmilk.de>
*
* This is converted assembly: SSE2 -> ARMv8-A
* Used tools: SIMDe https://github.com/simd-everywhere/simde
+ *
+ * Should work on FreeBSD, Linux and macOS
+ * see: https://github.com/mcmilk/BLAKE3-tests/blob/master/contrib/simde.sh
*/
#if defined(__aarch64__)
.text
- .section .rodata.cst16,"aM",@progbits,16
- .p2align 4
-.LCPI0_0:
- .word 1779033703
- .word 3144134277
- .word 1013904242
- .word 2773480762
-.LCPI0_1:
- .xword 0
- .xword -4294967296
-.LCPI0_2:
- .xword -1
- .xword 4294967295
+ .section .note.gnu.property,"a",@note /* ELF note section carrying GNU program properties */
+ .p2align 3 /* notes in this section are 8-byte aligned */
+ .word 4 /* namesz: 4 bytes, the owner string below including its NUL */
+ .word 16 /* descsz: 16 bytes of descriptor (the four words after the name) */
+ .word 5 /* note type 5, NT_GNU_PROPERTY_TYPE_0 */
+ .asciz "GNU" /* note owner name */
+ .word 3221225472 /* pr_type 0xc0000000, GNU_PROPERTY_AARCH64_FEATURE_1_AND */
+ .word 4 /* pr_datasz: one 4-byte feature bitmask follows */
+ .word 3 /* feature bits 0 and 1 set: BTI and PAC per the AArch64 ELF ABI */
+ .word 0 /* pads the descriptor out to the 16 bytes declared above */
+.Lsec_end0:
.text
.globl zfs_blake3_compress_in_place_sse2 /* exported symbol */
.p2align 2 /* 4-byte alignment, one A64 instruction */
.type zfs_blake3_compress_in_place_sse2,@function /* mark the symbol as a function for ELF tooling */
zfs_blake3_compress_in_place_sse2: /* in: x0, x1, w2, x3, w4; result is 32 bytes stored back through x0 */
.cfi_startproc /* begin unwind info; closed after the ret of this function */
- ldp q3, q2, [x0]
- ldp q5, q6, [x1]
- add x10, x1, #32
- lsr x11, x3, #32
- fmov s4, w3
- ld2 { v17.4s, v18.4s }, [x10]
- adrp x10, .LCPI0_2
- and w8, w2, #0xff
- mov v4.s[1], w11
- ldr q1, [x10, :lo12:.LCPI0_2]
- and w9, w4, #0xff
- adrp x12, .LCPI0_0
- mov v4.s[2], w8
- uzp1 v19.4s, v5.4s, v6.4s
- add v3.4s, v2.4s, v3.4s
- ldr q7, [x12, :lo12:.LCPI0_0]
- mov v4.s[3], w9
- add v3.4s, v3.4s, v19.4s
- uzp2 v5.4s, v5.4s, v6.4s
- ext v21.16b, v18.16b, v18.16b, #12
- uzp1 v6.4s, v19.4s, v19.4s
- ext v22.16b, v19.16b, v19.16b, #12
- eor v4.16b, v3.16b, v4.16b
- ext v20.16b, v17.16b, v17.16b, #12
- ext v6.16b, v6.16b, v19.16b, #8
- ext v19.16b, v19.16b, v22.16b, #12
- zip1 v22.2d, v21.2d, v5.2d
- rev32 v24.8h, v4.8h
- mov v4.16b, v1.16b
- zip2 v23.4s, v5.4s, v21.4s
- uzp2 v6.4s, v6.4s, v5.4s
- bsl v4.16b, v22.16b, v20.16b
- add v3.4s, v3.4s, v5.4s
- zip1 v5.4s, v23.4s, v20.4s
- zip1 v22.4s, v20.4s, v23.4s
- add v23.4s, v24.4s, v7.4s
- ext v7.16b, v6.16b, v6.16b, #4
- ext v25.16b, v4.16b, v4.16b, #12
- ext v5.16b, v22.16b, v5.16b, #8
- eor v2.16b, v23.16b, v2.16b
- uzp1 v4.4s, v4.4s, v25.4s
- uzp1 v22.4s, v7.4s, v7.4s
- ext v25.16b, v7.16b, v7.16b, #12
- ext v22.16b, v22.16b, v7.16b, #8
- ext v7.16b, v7.16b, v25.16b, #12
- ushr v25.4s, v2.4s, #12
- shl v2.4s, v2.4s, #20
- orr v2.16b, v2.16b, v25.16b
- add v3.4s, v3.4s, v2.4s
- eor v24.16b, v3.16b, v24.16b
- add v3.4s, v3.4s, v17.4s
- ushr v17.4s, v24.4s, #8
- shl v18.4s, v24.4s, #24
- orr v17.16b, v18.16b, v17.16b
- add v18.4s, v17.4s, v23.4s
- eor v2.16b, v18.16b, v2.16b
- ushr v23.4s, v2.4s, #7
- shl v2.4s, v2.4s, #25
- ext v3.16b, v3.16b, v3.16b, #12
- orr v2.16b, v2.16b, v23.16b
- ext v17.16b, v17.16b, v17.16b, #8
- add v3.4s, v2.4s, v3.4s
- adrp x11, .LCPI0_1
- eor v17.16b, v3.16b, v17.16b
- ldr q16, [x11, :lo12:.LCPI0_1]
- ext v18.16b, v18.16b, v18.16b, #4
- rev32 v24.8h, v17.8h
- movi v0.2d, #0xffffffff00000000
- add v23.4s, v3.4s, v21.4s
- mov v21.s[1], v20.s[2]
- add v20.4s, v18.4s, v24.4s
- bit v19.16b, v21.16b, v0.16b
- eor v3.16b, v20.16b, v2.16b
- uzp2 v2.4s, v22.4s, v19.4s
- zip1 v17.2d, v5.2d, v19.2d
- zip2 v18.4s, v19.4s, v5.4s
- ushr v21.4s, v3.4s, #12
- shl v3.4s, v3.4s, #20
- ext v22.16b, v2.16b, v2.16b, #4
- bsl v16.16b, v4.16b, v17.16b
- zip1 v17.4s, v18.4s, v4.4s
- zip1 v18.4s, v4.4s, v18.4s
- orr v21.16b, v3.16b, v21.16b
- ext v25.16b, v16.16b, v16.16b, #12
- ext v3.16b, v18.16b, v17.16b, #8
- uzp1 v18.4s, v22.4s, v22.4s
- ext v26.16b, v22.16b, v22.16b, #12
- add v23.4s, v23.4s, v21.4s
- uzp1 v17.4s, v16.4s, v25.4s
- ext v16.16b, v18.16b, v22.16b, #8
- ext v18.16b, v22.16b, v26.16b, #12
- eor v22.16b, v23.16b, v24.16b
- add v6.4s, v23.4s, v6.4s
- ushr v23.4s, v22.4s, #8
- shl v22.4s, v22.4s, #24
- orr v22.16b, v22.16b, v23.16b
- add v20.4s, v22.4s, v20.4s
- eor v21.16b, v20.16b, v21.16b
- ushr v23.4s, v21.4s, #7
- shl v21.4s, v21.4s, #25
- ext v6.16b, v6.16b, v6.16b, #4
- orr v21.16b, v21.16b, v23.16b
- ext v22.16b, v22.16b, v22.16b, #8
- add v6.4s, v21.4s, v6.4s
- eor v22.16b, v6.16b, v22.16b
- ext v20.16b, v20.16b, v20.16b, #12
- add v6.4s, v6.4s, v19.4s
- rev32 v19.8h, v22.8h
- add v20.4s, v20.4s, v19.4s
- eor v21.16b, v20.16b, v21.16b
- ushr v22.4s, v21.4s, #12
- shl v21.4s, v21.4s, #20
- orr v21.16b, v21.16b, v22.16b
- add v6.4s, v6.4s, v21.4s
- eor v19.16b, v6.16b, v19.16b
- ushr v22.4s, v19.4s, #8
- shl v19.4s, v19.4s, #24
- orr v19.16b, v19.16b, v22.16b
- add v20.4s, v19.4s, v20.4s
- eor v21.16b, v20.16b, v21.16b
- ext v6.16b, v6.16b, v6.16b, #12
- ushr v22.4s, v21.4s, #7
- shl v21.4s, v21.4s, #25
- add v6.4s, v6.4s, v4.4s
- orr v21.16b, v21.16b, v22.16b
- ext v19.16b, v19.16b, v19.16b, #8
- add v6.4s, v6.4s, v21.4s
- eor v19.16b, v6.16b, v19.16b
- ext v20.16b, v20.16b, v20.16b, #4
- rev32 v19.8h, v19.8h
- add v20.4s, v20.4s, v19.4s
- add v6.4s, v6.4s, v5.4s
- mov v5.s[1], v4.s[2]
- eor v4.16b, v20.16b, v21.16b
- ushr v21.4s, v4.4s, #12
- shl v4.4s, v4.4s, #20
- orr v21.16b, v4.16b, v21.16b
- add v6.4s, v6.4s, v21.4s
- eor v19.16b, v6.16b, v19.16b
- add v2.4s, v6.4s, v2.4s
- ushr v6.4s, v19.4s, #8
- shl v19.4s, v19.4s, #24
- orr v6.16b, v19.16b, v6.16b
- add v19.4s, v6.4s, v20.4s
- eor v20.16b, v19.16b, v21.16b
- ushr v21.4s, v20.4s, #7
- shl v20.4s, v20.4s, #25
- ext v2.16b, v2.16b, v2.16b, #4
- orr v20.16b, v20.16b, v21.16b
- ext v6.16b, v6.16b, v6.16b, #8
- add v2.4s, v20.4s, v2.4s
- eor v6.16b, v2.16b, v6.16b
- ext v19.16b, v19.16b, v19.16b, #12
- rev32 v6.8h, v6.8h
- add v19.4s, v19.4s, v6.4s
- mov v22.16b, v0.16b
- eor v20.16b, v19.16b, v20.16b
- bsl v22.16b, v5.16b, v7.16b
- ushr v21.4s, v20.4s, #12
- shl v20.4s, v20.4s, #20
- add v2.4s, v2.4s, v22.4s
- orr v20.16b, v20.16b, v21.16b
- add v2.4s, v2.4s, v20.4s
- eor v6.16b, v2.16b, v6.16b
- ushr v21.4s, v6.4s, #8
- shl v6.4s, v6.4s, #24
- orr v6.16b, v6.16b, v21.16b
- add v19.4s, v6.4s, v19.4s
- eor v20.16b, v19.16b, v20.16b
- ext v2.16b, v2.16b, v2.16b, #12
- ushr v21.4s, v20.4s, #7
- shl v20.4s, v20.4s, #25
- add v2.4s, v2.4s, v17.4s
- orr v20.16b, v20.16b, v21.16b
- ext v6.16b, v6.16b, v6.16b, #8
- add v2.4s, v2.4s, v20.4s
- eor v6.16b, v2.16b, v6.16b
- uzp2 v5.4s, v16.4s, v22.4s
- zip1 v7.2d, v3.2d, v22.2d
- zip2 v16.4s, v22.4s, v3.4s
- ext v19.16b, v19.16b, v19.16b, #4
- rev32 v22.8h, v6.8h
- ext v23.16b, v5.16b, v5.16b, #4
- bif v7.16b, v17.16b, v1.16b
- zip1 v24.4s, v16.4s, v17.4s
- zip1 v16.4s, v17.4s, v16.4s
- add v21.4s, v2.4s, v3.4s
- mov v3.s[1], v17.s[2]
- add v17.4s, v19.4s, v22.4s
- mov v19.16b, v0.16b
- ext v25.16b, v7.16b, v7.16b, #12
- ext v4.16b, v16.16b, v24.16b, #8
- uzp1 v16.4s, v23.4s, v23.4s
- bsl v19.16b, v3.16b, v18.16b
- eor v2.16b, v17.16b, v20.16b
- uzp1 v7.4s, v7.4s, v25.4s
- ext v25.16b, v16.16b, v23.16b, #8
- zip1 v3.2d, v4.2d, v19.2d
- ushr v20.4s, v2.4s, #12
- shl v2.4s, v2.4s, #20
- ext v24.16b, v23.16b, v23.16b, #12
- uzp2 v6.4s, v25.4s, v19.4s
- zip2 v18.4s, v19.4s, v4.4s
- bif v3.16b, v7.16b, v1.16b
- orr v20.16b, v2.16b, v20.16b
- ext v16.16b, v23.16b, v24.16b, #12
- ext v23.16b, v6.16b, v6.16b, #4
- zip1 v24.4s, v18.4s, v7.4s
- zip1 v18.4s, v7.4s, v18.4s
- ext v25.16b, v3.16b, v3.16b, #12
- add v21.4s, v21.4s, v20.4s
- ext v2.16b, v18.16b, v24.16b, #8
- uzp1 v18.4s, v23.4s, v23.4s
- ext v24.16b, v23.16b, v23.16b, #12
- uzp1 v3.4s, v3.4s, v25.4s
- eor v22.16b, v21.16b, v22.16b
- ext v25.16b, v18.16b, v23.16b, #8
- dup v18.4s, v2.s[3]
- ext v23.16b, v23.16b, v24.16b, #12
- add v5.4s, v21.4s, v5.4s
- trn1 v21.4s, v3.4s, v3.4s
- ushr v24.4s, v22.4s, #8
- shl v22.4s, v22.4s, #24
- ext v18.16b, v21.16b, v18.16b, #8
- orr v21.16b, v22.16b, v24.16b
- add v17.4s, v21.4s, v17.4s
- eor v20.16b, v17.16b, v20.16b
- ushr v22.4s, v20.4s, #7
- shl v20.4s, v20.4s, #25
- ext v5.16b, v5.16b, v5.16b, #4
- orr v20.16b, v20.16b, v22.16b
- ext v21.16b, v21.16b, v21.16b, #8
- add v5.4s, v20.4s, v5.4s
- eor v21.16b, v5.16b, v21.16b
- ext v17.16b, v17.16b, v17.16b, #12
- add v5.4s, v5.4s, v19.4s
- rev32 v19.8h, v21.8h
- add v17.4s, v17.4s, v19.4s
- eor v20.16b, v17.16b, v20.16b
- ushr v21.4s, v20.4s, #12
- shl v20.4s, v20.4s, #20
- orr v20.16b, v20.16b, v21.16b
- add v5.4s, v5.4s, v20.4s
- eor v19.16b, v5.16b, v19.16b
- ushr v21.4s, v19.4s, #8
- shl v19.4s, v19.4s, #24
- orr v19.16b, v19.16b, v21.16b
- add v17.4s, v19.4s, v17.4s
- eor v20.16b, v17.16b, v20.16b
- ext v5.16b, v5.16b, v5.16b, #12
- ushr v21.4s, v20.4s, #7
- shl v20.4s, v20.4s, #25
- add v5.4s, v5.4s, v7.4s
- orr v20.16b, v20.16b, v21.16b
- ext v19.16b, v19.16b, v19.16b, #8
- add v5.4s, v5.4s, v20.4s
- eor v19.16b, v5.16b, v19.16b
- ext v17.16b, v17.16b, v17.16b, #4
- rev32 v22.8h, v19.8h
- add v21.4s, v5.4s, v4.4s
- mov v4.s[1], v7.s[2]
- add v19.4s, v17.4s, v22.4s
- bit v16.16b, v4.16b, v0.16b
- eor v5.16b, v19.16b, v20.16b
- uzp2 v4.4s, v25.4s, v16.4s
- zip1 v7.2d, v2.2d, v16.2d
- zip2 v17.4s, v16.4s, v2.4s
- ushr v20.4s, v5.4s, #12
- shl v5.4s, v5.4s, #20
- ext v24.16b, v4.16b, v4.16b, #4
- bif v7.16b, v3.16b, v1.16b
- zip1 v25.4s, v17.4s, v3.4s
- zip1 v17.4s, v3.4s, v17.4s
- orr v20.16b, v5.16b, v20.16b
- ext v26.16b, v7.16b, v7.16b, #12
- ext v5.16b, v17.16b, v25.16b, #8
- uzp1 v17.4s, v24.4s, v24.4s
- ext v25.16b, v24.16b, v24.16b, #12
- bit v23.16b, v18.16b, v0.16b
- add v21.4s, v21.4s, v20.4s
- uzp1 v7.4s, v7.4s, v26.4s
- ext v26.16b, v17.16b, v24.16b, #8
- ext v17.16b, v24.16b, v25.16b, #12
- eor v22.16b, v21.16b, v22.16b
- add v6.4s, v21.4s, v6.4s
- zip1 v21.2d, v5.2d, v23.2d
- zip2 v24.4s, v23.4s, v5.4s
- bif v21.16b, v7.16b, v1.16b
- zip1 v1.4s, v24.4s, v7.4s
- zip1 v24.4s, v7.4s, v24.4s
- ext v1.16b, v24.16b, v1.16b, #8
- ushr v24.4s, v22.4s, #8
- shl v22.4s, v22.4s, #24
- orr v22.16b, v22.16b, v24.16b
- add v19.4s, v22.4s, v19.4s
- ext v24.16b, v21.16b, v21.16b, #12
- eor v20.16b, v19.16b, v20.16b
- uzp1 v21.4s, v21.4s, v24.4s
- ushr v24.4s, v20.4s, #7
- shl v20.4s, v20.4s, #25
- orr v20.16b, v20.16b, v24.16b
- ext v6.16b, v6.16b, v6.16b, #4
- ext v22.16b, v22.16b, v22.16b, #8
- add v6.4s, v20.4s, v6.4s
- eor v22.16b, v6.16b, v22.16b
- ext v19.16b, v19.16b, v19.16b, #12
- add v6.4s, v6.4s, v16.4s
- rev32 v16.8h, v22.8h
- add v19.4s, v19.4s, v16.4s
- eor v20.16b, v19.16b, v20.16b
- ushr v22.4s, v20.4s, #12
- shl v20.4s, v20.4s, #20
- orr v20.16b, v20.16b, v22.16b
- add v6.4s, v6.4s, v20.4s
- eor v16.16b, v6.16b, v16.16b
- ext v6.16b, v6.16b, v6.16b, #12
- add v3.4s, v6.4s, v3.4s
- ushr v6.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- orr v6.16b, v16.16b, v6.16b
- add v16.4s, v6.4s, v19.4s
- eor v19.16b, v16.16b, v20.16b
- ushr v20.4s, v19.4s, #7
- shl v19.4s, v19.4s, #25
- orr v19.16b, v19.16b, v20.16b
- ext v6.16b, v6.16b, v6.16b, #8
- add v3.4s, v3.4s, v19.4s
- eor v6.16b, v3.16b, v6.16b
- ext v16.16b, v16.16b, v16.16b, #4
- add v2.4s, v3.4s, v2.4s
- rev32 v3.8h, v6.8h
- add v6.4s, v16.4s, v3.4s
- eor v16.16b, v6.16b, v19.16b
- ushr v19.4s, v16.4s, #12
- shl v16.4s, v16.4s, #20
- orr v16.16b, v16.16b, v19.16b
- add v2.4s, v2.4s, v16.4s
- eor v3.16b, v2.16b, v3.16b
- add v2.4s, v2.4s, v4.4s
- ushr v4.4s, v3.4s, #8
- shl v3.4s, v3.4s, #24
- orr v3.16b, v3.16b, v4.16b
- add v4.4s, v3.4s, v6.4s
- eor v6.16b, v4.16b, v16.16b
- ushr v16.4s, v6.4s, #7
- shl v6.4s, v6.4s, #25
- ext v2.16b, v2.16b, v2.16b, #4
- orr v6.16b, v6.16b, v16.16b
- ext v3.16b, v3.16b, v3.16b, #8
- add v2.4s, v6.4s, v2.4s
- eor v3.16b, v2.16b, v3.16b
- ext v4.16b, v4.16b, v4.16b, #12
- rev32 v3.8h, v3.8h
- add v4.4s, v4.4s, v3.4s
- eor v6.16b, v4.16b, v6.16b
- ushr v16.4s, v6.4s, #12
- shl v6.4s, v6.4s, #20
- add v2.4s, v2.4s, v23.4s
- orr v6.16b, v6.16b, v16.16b
- add v2.4s, v2.4s, v6.4s
- eor v3.16b, v2.16b, v3.16b
- ushr v16.4s, v3.4s, #8
- shl v3.4s, v3.4s, #24
- orr v3.16b, v3.16b, v16.16b
- add v4.4s, v3.4s, v4.4s
- eor v6.16b, v4.16b, v6.16b
- ext v2.16b, v2.16b, v2.16b, #12
- ushr v16.4s, v6.4s, #7
- shl v6.4s, v6.4s, #25
- add v2.4s, v2.4s, v7.4s
- orr v6.16b, v6.16b, v16.16b
- ext v3.16b, v3.16b, v3.16b, #8
- add v2.4s, v2.4s, v6.4s
- eor v3.16b, v2.16b, v3.16b
- ext v4.16b, v4.16b, v4.16b, #4
- rev32 v3.8h, v3.8h
- add v2.4s, v2.4s, v5.4s
- mov v5.s[1], v7.s[2]
- add v4.4s, v4.4s, v3.4s
- bsl v0.16b, v5.16b, v17.16b
- eor v5.16b, v4.16b, v6.16b
- ushr v6.4s, v5.4s, #12
- shl v5.4s, v5.4s, #20
- orr v5.16b, v5.16b, v6.16b
- add v2.4s, v2.4s, v5.4s
- eor v3.16b, v2.16b, v3.16b
- ushr v6.4s, v3.4s, #8
- shl v3.4s, v3.4s, #24
- orr v3.16b, v3.16b, v6.16b
- add v4.4s, v3.4s, v4.4s
- uzp2 v18.4s, v26.4s, v18.4s
- eor v5.16b, v4.16b, v5.16b
- add v2.4s, v2.4s, v18.4s
- ushr v6.4s, v5.4s, #7
- shl v5.4s, v5.4s, #25
- ext v2.16b, v2.16b, v2.16b, #4
- orr v5.16b, v5.16b, v6.16b
- ext v3.16b, v3.16b, v3.16b, #8
- add v2.4s, v5.4s, v2.4s
- eor v3.16b, v2.16b, v3.16b
- ext v4.16b, v4.16b, v4.16b, #12
- add v0.4s, v2.4s, v0.4s
- rev32 v2.8h, v3.8h
- add v3.4s, v4.4s, v2.4s
- eor v4.16b, v3.16b, v5.16b
- ushr v5.4s, v4.4s, #12
- shl v4.4s, v4.4s, #20
- orr v4.16b, v4.16b, v5.16b
- add v0.4s, v0.4s, v4.4s
- eor v2.16b, v0.16b, v2.16b
- ushr v5.4s, v2.4s, #8
- shl v2.4s, v2.4s, #24
- orr v2.16b, v2.16b, v5.16b
- add v3.4s, v2.4s, v3.4s
- eor v4.16b, v3.16b, v4.16b
- ext v0.16b, v0.16b, v0.16b, #12
- ushr v5.4s, v4.4s, #7
- shl v4.4s, v4.4s, #25
- add v0.4s, v0.4s, v21.4s
- orr v4.16b, v4.16b, v5.16b
- ext v2.16b, v2.16b, v2.16b, #8
- add v0.4s, v0.4s, v4.4s
- eor v2.16b, v0.16b, v2.16b
- ext v3.16b, v3.16b, v3.16b, #4
- add v0.4s, v0.4s, v1.4s
- rev32 v1.8h, v2.8h
- add v2.4s, v3.4s, v1.4s
- eor v3.16b, v2.16b, v4.16b
- ushr v4.4s, v3.4s, #12
- shl v3.4s, v3.4s, #20
- orr v3.16b, v3.16b, v4.16b
- add v0.4s, v0.4s, v3.4s
- eor v1.16b, v0.16b, v1.16b
- ushr v4.4s, v1.4s, #8
- shl v1.4s, v1.4s, #24
- orr v1.16b, v1.16b, v4.16b
- add v2.4s, v1.4s, v2.4s
- eor v3.16b, v2.16b, v3.16b
- ext v0.16b, v0.16b, v0.16b, #4
- ext v2.16b, v2.16b, v2.16b, #12
- ushr v4.4s, v3.4s, #7
- shl v3.4s, v3.4s, #25
- ext v1.16b, v1.16b, v1.16b, #8
+ hint #25 /* PACIASP spelled as a hint: signs x30 against sp, assembles without PAuth support */
+ .cfi_negate_ra_state /* tell the unwinder the return address is now signed */
+ sub sp, sp, #96 /* frame: [sp,0..63] scratch buffer, [sp,64] x29/x30, [sp,80] x19 */
+ stp x29, x30, [sp, #64] /* save frame pointer and link register */
+ add x29, sp, #64 /* x29 points at the saved x29/x30 pair */
+ str x19, [sp, #80] /* x19 is callee-saved and is reused below */
+ .cfi_def_cfa w29, 32 /* CFA = x29 + 32, i.e. sp on entry */
+ .cfi_offset w19, -16 /* x19 saved at CFA-16 (sp+80) */
+ .cfi_offset w30, -24 /* x30 saved at CFA-24 (sp+72) */
+ .cfi_offset w29, -32 /* x29 saved at CFA-32 (sp+64) */
+ mov x19, x0 /* keep the first argument alive across the call */
+ mov w5, w4 /* shift every incoming argument up by one register ... */
+ mov x4, x3 /* ... moving highest first so nothing is overwritten ... */
+ mov w3, w2 /* ... before it has been copied */
+ mov x2, x1 /* old x1 becomes the third argument */
+ mov x0, sp /* new first argument: the 64-byte scratch buffer on the stack */
+ mov x1, x19 /* new second argument: the original x0 */
+ bl compress_pre /* presumably fills the scratch buffer with the 4x16-byte state; its end is outside this view */
+ ldp q0, q1, [sp] /* scratch bytes 0..31 */
+ ldp q2, q3, [sp, #32] /* scratch bytes 32..63 */
eor v0.16b, v2.16b, v0.16b /* first 16 output bytes = scratch[0..15] xor scratch[32..47] */
- orr v2.16b, v3.16b, v4.16b
- eor v1.16b, v2.16b, v1.16b
- stp q0, q1, [x0]
+ eor v1.16b, v3.16b, v1.16b /* next 16 output bytes = scratch[16..31] xor scratch[48..63] */
+ ldp x29, x30, [sp, #64] /* restore frame pointer and (still signed) link register */
+ stp q0, q1, [x19] /* write the 32-byte result through the original first argument */
+ ldr x19, [sp, #80] /* restore callee-saved x19 */
+ add sp, sp, #96 /* release the frame; sp must match its value at hint #25 */
+ hint #29 /* AUTIASP spelled as a hint: authenticates x30 against sp before returning */
ret
.Lfunc_end0: /* end marker used by the .size expression below */
.size zfs_blake3_compress_in_place_sse2, .Lfunc_end0-zfs_blake3_compress_in_place_sse2
.section .rodata.cst16,"aM",@progbits,16 /* read-only, mergeable constants with 16-byte entries */
.p2align 4 /* 16-byte alignment so the entry can be loaded as one q register */
.LCPI1_0: /* single 16-byte constant, loaded via adrp/ldr in compress_pre */
- .word 1779033703
- .word 3144134277
- .word 1013904242
- .word 2773480762
-.LCPI1_1:
- .xword 0
- .xword -4294967296
-.LCPI1_2:
- .xword -1
- .xword 4294967295
+ .xword -4942790177982912921 /* 0xBB67AE856A09E667: little-endian words 0x6A09E667, 0xBB67AE85 */
+ .xword -6534734903820487822 /* 0xA54FF53A3C6EF372: little-endian words 0x3C6EF372, 0xA54FF53A */
.text
- .globl zfs_blake3_compress_xof_sse2
.p2align 2
- .type zfs_blake3_compress_xof_sse2,@function
-zfs_blake3_compress_xof_sse2:
+ .type compress_pre,@function
+compress_pre:
.cfi_startproc
- ldp q3, q2, [x0]
- ldp q5, q6, [x1]
- add x10, x1, #32
- lsr x11, x3, #32
- fmov s4, w3
- ld2 { v17.4s, v18.4s }, [x10]
- adrp x10, .LCPI1_2
- and w8, w2, #0xff
- mov v4.s[1], w11
- ldr q1, [x10, :lo12:.LCPI1_2]
- and w9, w4, #0xff
- adrp x12, .LCPI1_0
- mov v4.s[2], w8
- uzp1 v19.4s, v5.4s, v6.4s
- add v3.4s, v2.4s, v3.4s
- ldr q7, [x12, :lo12:.LCPI1_0]
- mov v4.s[3], w9
- add v3.4s, v3.4s, v19.4s
- uzp2 v5.4s, v5.4s, v6.4s
- ext v21.16b, v18.16b, v18.16b, #12
- uzp1 v6.4s, v19.4s, v19.4s
- ext v22.16b, v19.16b, v19.16b, #12
- eor v4.16b, v3.16b, v4.16b
- ext v20.16b, v17.16b, v17.16b, #12
- ext v6.16b, v6.16b, v19.16b, #8
- ext v19.16b, v19.16b, v22.16b, #12
- zip1 v22.2d, v21.2d, v5.2d
- rev32 v24.8h, v4.8h
- mov v4.16b, v1.16b
- zip2 v23.4s, v5.4s, v21.4s
- uzp2 v6.4s, v6.4s, v5.4s
- bsl v4.16b, v22.16b, v20.16b
- add v3.4s, v3.4s, v5.4s
- zip1 v5.4s, v23.4s, v20.4s
- zip1 v22.4s, v20.4s, v23.4s
- add v23.4s, v24.4s, v7.4s
- ext v7.16b, v6.16b, v6.16b, #4
- ext v25.16b, v4.16b, v4.16b, #12
- ext v5.16b, v22.16b, v5.16b, #8
- eor v2.16b, v23.16b, v2.16b
- uzp1 v4.4s, v4.4s, v25.4s
- uzp1 v22.4s, v7.4s, v7.4s
- ext v25.16b, v7.16b, v7.16b, #12
- ext v22.16b, v22.16b, v7.16b, #8
- ext v7.16b, v7.16b, v25.16b, #12
- ushr v25.4s, v2.4s, #12
- shl v2.4s, v2.4s, #20
- orr v2.16b, v2.16b, v25.16b
- add v3.4s, v3.4s, v2.4s
- eor v24.16b, v3.16b, v24.16b
- add v3.4s, v3.4s, v17.4s
- ushr v17.4s, v24.4s, #8
- shl v18.4s, v24.4s, #24
- orr v17.16b, v18.16b, v17.16b
- add v18.4s, v17.4s, v23.4s
- eor v2.16b, v18.16b, v2.16b
- ushr v23.4s, v2.4s, #7
- shl v2.4s, v2.4s, #25
- ext v3.16b, v3.16b, v3.16b, #12
- orr v2.16b, v2.16b, v23.16b
- ext v17.16b, v17.16b, v17.16b, #8
- add v3.4s, v2.4s, v3.4s
- adrp x11, .LCPI1_1
- eor v17.16b, v3.16b, v17.16b
- ldr q16, [x11, :lo12:.LCPI1_1]
- ext v18.16b, v18.16b, v18.16b, #4
- rev32 v24.8h, v17.8h
- movi v0.2d, #0xffffffff00000000
- add v23.4s, v3.4s, v21.4s
- mov v21.s[1], v20.s[2]
- add v20.4s, v18.4s, v24.4s
- bit v19.16b, v21.16b, v0.16b
- eor v3.16b, v20.16b, v2.16b
- uzp2 v2.4s, v22.4s, v19.4s
- zip1 v17.2d, v5.2d, v19.2d
- zip2 v18.4s, v19.4s, v5.4s
- ushr v21.4s, v3.4s, #12
- shl v3.4s, v3.4s, #20
- ext v22.16b, v2.16b, v2.16b, #4
- bsl v16.16b, v4.16b, v17.16b
- zip1 v17.4s, v18.4s, v4.4s
- zip1 v18.4s, v4.4s, v18.4s
- orr v21.16b, v3.16b, v21.16b
- ext v25.16b, v16.16b, v16.16b, #12
- ext v3.16b, v18.16b, v17.16b, #8
- uzp1 v18.4s, v22.4s, v22.4s
- ext v26.16b, v22.16b, v22.16b, #12
- add v23.4s, v23.4s, v21.4s
- uzp1 v17.4s, v16.4s, v25.4s
- ext v16.16b, v18.16b, v22.16b, #8
- ext v18.16b, v22.16b, v26.16b, #12
- eor v22.16b, v23.16b, v24.16b
- add v6.4s, v23.4s, v6.4s
- ushr v23.4s, v22.4s, #8
- shl v22.4s, v22.4s, #24
- orr v22.16b, v22.16b, v23.16b
- add v20.4s, v22.4s, v20.4s
- eor v21.16b, v20.16b, v21.16b
- ushr v23.4s, v21.4s, #7
- shl v21.4s, v21.4s, #25
- ext v6.16b, v6.16b, v6.16b, #4
- orr v21.16b, v21.16b, v23.16b
- ext v22.16b, v22.16b, v22.16b, #8
- add v6.4s, v21.4s, v6.4s
- eor v22.16b, v6.16b, v22.16b
- ext v20.16b, v20.16b, v20.16b, #12
- add v6.4s, v6.4s, v19.4s
- rev32 v19.8h, v22.8h
- add v20.4s, v20.4s, v19.4s
- eor v21.16b, v20.16b, v21.16b
- ushr v22.4s, v21.4s, #12
- shl v21.4s, v21.4s, #20
- orr v21.16b, v21.16b, v22.16b
- add v6.4s, v6.4s, v21.4s
- eor v19.16b, v6.16b, v19.16b
- ushr v22.4s, v19.4s, #8
- shl v19.4s, v19.4s, #24
- orr v19.16b, v19.16b, v22.16b
- add v20.4s, v19.4s, v20.4s
- eor v21.16b, v20.16b, v21.16b
- ext v6.16b, v6.16b, v6.16b, #12
- ushr v22.4s, v21.4s, #7
- shl v21.4s, v21.4s, #25
- add v6.4s, v6.4s, v4.4s
- orr v21.16b, v21.16b, v22.16b
- ext v19.16b, v19.16b, v19.16b, #8
- add v6.4s, v6.4s, v21.4s
- eor v19.16b, v6.16b, v19.16b
- ext v20.16b, v20.16b, v20.16b, #4
- rev32 v19.8h, v19.8h
- add v20.4s, v20.4s, v19.4s
- add v6.4s, v6.4s, v5.4s
- mov v5.s[1], v4.s[2]
- eor v4.16b, v20.16b, v21.16b
- ushr v21.4s, v4.4s, #12
- shl v4.4s, v4.4s, #20
- orr v21.16b, v4.16b, v21.16b
- add v6.4s, v6.4s, v21.4s
- eor v19.16b, v6.16b, v19.16b
- add v2.4s, v6.4s, v2.4s
- ushr v6.4s, v19.4s, #8
- shl v19.4s, v19.4s, #24
- orr v6.16b, v19.16b, v6.16b
- add v19.4s, v6.4s, v20.4s
- eor v20.16b, v19.16b, v21.16b
- ushr v21.4s, v20.4s, #7
- shl v20.4s, v20.4s, #25
- ext v2.16b, v2.16b, v2.16b, #4
- orr v20.16b, v20.16b, v21.16b
- ext v6.16b, v6.16b, v6.16b, #8
- add v2.4s, v20.4s, v2.4s
- eor v6.16b, v2.16b, v6.16b
- ext v19.16b, v19.16b, v19.16b, #12
- rev32 v6.8h, v6.8h
- add v19.4s, v19.4s, v6.4s
- mov v22.16b, v0.16b
- eor v20.16b, v19.16b, v20.16b
- bsl v22.16b, v5.16b, v7.16b
- ushr v21.4s, v20.4s, #12
- shl v20.4s, v20.4s, #20
- add v2.4s, v2.4s, v22.4s
- orr v20.16b, v20.16b, v21.16b
- add v2.4s, v2.4s, v20.4s
- eor v6.16b, v2.16b, v6.16b
- ushr v21.4s, v6.4s, #8
- shl v6.4s, v6.4s, #24
- orr v6.16b, v6.16b, v21.16b
- add v19.4s, v6.4s, v19.4s
- eor v20.16b, v19.16b, v20.16b
- ext v2.16b, v2.16b, v2.16b, #12
- ushr v21.4s, v20.4s, #7
- shl v20.4s, v20.4s, #25
- add v2.4s, v2.4s, v17.4s
- orr v20.16b, v20.16b, v21.16b
- ext v6.16b, v6.16b, v6.16b, #8
- add v2.4s, v2.4s, v20.4s
- eor v6.16b, v2.16b, v6.16b
- uzp2 v5.4s, v16.4s, v22.4s
- zip1 v7.2d, v3.2d, v22.2d
- zip2 v16.4s, v22.4s, v3.4s
- ext v19.16b, v19.16b, v19.16b, #4
- rev32 v22.8h, v6.8h
- ext v23.16b, v5.16b, v5.16b, #4
- bif v7.16b, v17.16b, v1.16b
- zip1 v24.4s, v16.4s, v17.4s
- zip1 v16.4s, v17.4s, v16.4s
- add v21.4s, v2.4s, v3.4s
- mov v3.s[1], v17.s[2]
- add v17.4s, v19.4s, v22.4s
- mov v19.16b, v0.16b
- ext v25.16b, v7.16b, v7.16b, #12
- ext v4.16b, v16.16b, v24.16b, #8
- uzp1 v16.4s, v23.4s, v23.4s
- bsl v19.16b, v3.16b, v18.16b
- eor v2.16b, v17.16b, v20.16b
- uzp1 v7.4s, v7.4s, v25.4s
- ext v25.16b, v16.16b, v23.16b, #8
- zip1 v3.2d, v4.2d, v19.2d
- ushr v20.4s, v2.4s, #12
- shl v2.4s, v2.4s, #20
- ext v24.16b, v23.16b, v23.16b, #12
- uzp2 v6.4s, v25.4s, v19.4s
- zip2 v18.4s, v19.4s, v4.4s
- bif v3.16b, v7.16b, v1.16b
- orr v20.16b, v2.16b, v20.16b
- ext v16.16b, v23.16b, v24.16b, #12
- ext v23.16b, v6.16b, v6.16b, #4
- zip1 v24.4s, v18.4s, v7.4s
- zip1 v18.4s, v7.4s, v18.4s
- ext v25.16b, v3.16b, v3.16b, #12
- add v21.4s, v21.4s, v20.4s
- ext v2.16b, v18.16b, v24.16b, #8
- uzp1 v18.4s, v23.4s, v23.4s
- ext v24.16b, v23.16b, v23.16b, #12
- uzp1 v3.4s, v3.4s, v25.4s
- eor v22.16b, v21.16b, v22.16b
- ext v25.16b, v18.16b, v23.16b, #8
- dup v18.4s, v2.s[3]
- ext v23.16b, v23.16b, v24.16b, #12
- add v5.4s, v21.4s, v5.4s
- trn1 v21.4s, v3.4s, v3.4s
- ushr v24.4s, v22.4s, #8
- shl v22.4s, v22.4s, #24
- ext v18.16b, v21.16b, v18.16b, #8
- orr v21.16b, v22.16b, v24.16b
- add v17.4s, v21.4s, v17.4s
- eor v20.16b, v17.16b, v20.16b
- ushr v22.4s, v20.4s, #7
- shl v20.4s, v20.4s, #25
- ext v5.16b, v5.16b, v5.16b, #4
- orr v20.16b, v20.16b, v22.16b
- ext v21.16b, v21.16b, v21.16b, #8
- add v5.4s, v20.4s, v5.4s
- eor v21.16b, v5.16b, v21.16b
- ext v17.16b, v17.16b, v17.16b, #12
- add v5.4s, v5.4s, v19.4s
- rev32 v19.8h, v21.8h
- add v17.4s, v17.4s, v19.4s
- eor v20.16b, v17.16b, v20.16b
- ushr v21.4s, v20.4s, #12
- shl v20.4s, v20.4s, #20
- orr v20.16b, v20.16b, v21.16b
- add v5.4s, v5.4s, v20.4s
- eor v19.16b, v5.16b, v19.16b
- ushr v21.4s, v19.4s, #8
- shl v19.4s, v19.4s, #24
- orr v19.16b, v19.16b, v21.16b
- add v17.4s, v19.4s, v17.4s
- eor v20.16b, v17.16b, v20.16b
- ext v5.16b, v5.16b, v5.16b, #12
- ushr v21.4s, v20.4s, #7
- shl v20.4s, v20.4s, #25
- add v5.4s, v5.4s, v7.4s
- orr v20.16b, v20.16b, v21.16b
- ext v19.16b, v19.16b, v19.16b, #8
- add v5.4s, v5.4s, v20.4s
- eor v19.16b, v5.16b, v19.16b
- ext v17.16b, v17.16b, v17.16b, #4
- rev32 v22.8h, v19.8h
- add v21.4s, v5.4s, v4.4s
- mov v4.s[1], v7.s[2]
- add v19.4s, v17.4s, v22.4s
- bit v16.16b, v4.16b, v0.16b
- eor v5.16b, v19.16b, v20.16b
- uzp2 v4.4s, v25.4s, v16.4s
- zip1 v7.2d, v2.2d, v16.2d
- zip2 v17.4s, v16.4s, v2.4s
- ushr v20.4s, v5.4s, #12
- shl v5.4s, v5.4s, #20
- ext v24.16b, v4.16b, v4.16b, #4
- bif v7.16b, v3.16b, v1.16b
- zip1 v25.4s, v17.4s, v3.4s
- zip1 v17.4s, v3.4s, v17.4s
- orr v20.16b, v5.16b, v20.16b
- ext v26.16b, v7.16b, v7.16b, #12
- ext v5.16b, v17.16b, v25.16b, #8
- uzp1 v17.4s, v24.4s, v24.4s
- ext v25.16b, v24.16b, v24.16b, #12
- bit v23.16b, v18.16b, v0.16b
- add v21.4s, v21.4s, v20.4s
- uzp1 v7.4s, v7.4s, v26.4s
- ext v26.16b, v17.16b, v24.16b, #8
- ext v17.16b, v24.16b, v25.16b, #12
- eor v22.16b, v21.16b, v22.16b
- add v6.4s, v21.4s, v6.4s
- zip1 v21.2d, v5.2d, v23.2d
- zip2 v24.4s, v23.4s, v5.4s
- bif v21.16b, v7.16b, v1.16b
- zip1 v1.4s, v24.4s, v7.4s
- zip1 v24.4s, v7.4s, v24.4s
- ext v1.16b, v24.16b, v1.16b, #8
- ushr v24.4s, v22.4s, #8
- shl v22.4s, v22.4s, #24
- orr v22.16b, v22.16b, v24.16b
- add v19.4s, v22.4s, v19.4s
- ext v24.16b, v21.16b, v21.16b, #12
- eor v20.16b, v19.16b, v20.16b
- uzp1 v21.4s, v21.4s, v24.4s
- ushr v24.4s, v20.4s, #7
- shl v20.4s, v20.4s, #25
- orr v20.16b, v20.16b, v24.16b
- ext v6.16b, v6.16b, v6.16b, #4
- ext v22.16b, v22.16b, v22.16b, #8
- add v6.4s, v20.4s, v6.4s
- eor v22.16b, v6.16b, v22.16b
- ext v19.16b, v19.16b, v19.16b, #12
- add v6.4s, v6.4s, v16.4s
- rev32 v16.8h, v22.8h
- add v19.4s, v19.4s, v16.4s
- eor v20.16b, v19.16b, v20.16b
- ushr v22.4s, v20.4s, #12
- shl v20.4s, v20.4s, #20
- orr v20.16b, v20.16b, v22.16b
- add v6.4s, v6.4s, v20.4s
- eor v16.16b, v6.16b, v16.16b
- ext v6.16b, v6.16b, v6.16b, #12
- add v3.4s, v6.4s, v3.4s
- ushr v6.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- orr v6.16b, v16.16b, v6.16b
- add v16.4s, v6.4s, v19.4s
- eor v19.16b, v16.16b, v20.16b
- ushr v20.4s, v19.4s, #7
- shl v19.4s, v19.4s, #25
- orr v19.16b, v19.16b, v20.16b
- ext v6.16b, v6.16b, v6.16b, #8
- add v3.4s, v3.4s, v19.4s
- eor v6.16b, v3.16b, v6.16b
- ext v16.16b, v16.16b, v16.16b, #4
- add v2.4s, v3.4s, v2.4s
- rev32 v3.8h, v6.8h
- add v6.4s, v16.4s, v3.4s
- eor v16.16b, v6.16b, v19.16b
- ushr v19.4s, v16.4s, #12
- shl v16.4s, v16.4s, #20
- orr v16.16b, v16.16b, v19.16b
- add v2.4s, v2.4s, v16.4s
- eor v3.16b, v2.16b, v3.16b
- add v2.4s, v2.4s, v4.4s
- ushr v4.4s, v3.4s, #8
- shl v3.4s, v3.4s, #24
- orr v3.16b, v3.16b, v4.16b
- add v4.4s, v3.4s, v6.4s
- eor v6.16b, v4.16b, v16.16b
- ushr v16.4s, v6.4s, #7
- shl v6.4s, v6.4s, #25
- ext v2.16b, v2.16b, v2.16b, #4
- orr v6.16b, v6.16b, v16.16b
- ext v3.16b, v3.16b, v3.16b, #8
- add v2.4s, v6.4s, v2.4s
+ hint #34
+ fmov s1, w3
+ movi d0, #0x0000ff000000ff
+ ldr q2, [x1]
+ fmov d3, x4
+ adrp x8, .LCPI1_0
+ mov v1.s[1], w5
+ str q2, [x0]
+ ldr q4, [x8, :lo12:.LCPI1_0]
+ add x8, x2, #32
+ ldr q5, [x1, #16]
+ and v0.8b, v1.8b, v0.8b
+ stp q5, q4, [x0, #16]
+ mov v3.d[1], v0.d[0]
+ str q3, [x0, #48]
+ ldp q0, q6, [x2]
+ uzp1 v1.4s, v0.4s, v6.4s
+ uzp2 v0.4s, v0.4s, v6.4s
+ add v2.4s, v2.4s, v1.4s
+ uzp1 v18.4s, v1.4s, v1.4s
+ add v2.4s, v2.4s, v5.4s
eor v3.16b, v2.16b, v3.16b
- ext v4.16b, v4.16b, v4.16b, #12
+ add v2.4s, v2.4s, v0.4s
rev32 v3.8h, v3.8h
- add v4.4s, v4.4s, v3.4s
- eor v6.16b, v4.16b, v6.16b
- ushr v16.4s, v6.4s, #12
- shl v6.4s, v6.4s, #20
- add v2.4s, v2.4s, v23.4s
- orr v6.16b, v6.16b, v16.16b
- add v2.4s, v2.4s, v6.4s
+ add v4.4s, v3.4s, v4.4s
+ eor v5.16b, v4.16b, v5.16b
+ ushr v6.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v5.16b, v5.16b, v6.16b
+ add v2.4s, v2.4s, v5.4s
eor v3.16b, v2.16b, v3.16b
- ushr v16.4s, v3.4s, #8
+ ushr v6.4s, v3.4s, #8
shl v3.4s, v3.4s, #24
- orr v3.16b, v3.16b, v16.16b
+ orr v3.16b, v3.16b, v6.16b
+ ld2 { v6.4s, v7.4s }, [x8]
add v4.4s, v3.4s, v4.4s
- eor v6.16b, v4.16b, v6.16b
- ext v2.16b, v2.16b, v2.16b, #12
- ushr v16.4s, v6.4s, #7
- shl v6.4s, v6.4s, #25
- add v2.4s, v2.4s, v7.4s
- orr v6.16b, v6.16b, v16.16b
ext v3.16b, v3.16b, v3.16b, #8
add v2.4s, v2.4s, v6.4s
- eor v3.16b, v2.16b, v3.16b
+ eor v5.16b, v4.16b, v5.16b
ext v4.16b, v4.16b, v4.16b, #4
- rev32 v3.8h, v3.8h
+ ext v6.16b, v6.16b, v6.16b, #12
+ ext v2.16b, v2.16b, v2.16b, #12
+ ushr v16.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ orr v5.16b, v5.16b, v16.16b
+ ext v16.16b, v7.16b, v7.16b, #12
add v2.4s, v2.4s, v5.4s
- mov v5.s[1], v7.s[2]
+ mov v7.16b, v16.16b
+ eor v3.16b, v3.16b, v2.16b
+ add v2.4s, v2.4s, v16.4s
+ mov v7.s[1], v6.s[2]
+ rev32 v3.8h, v3.8h
add v4.4s, v4.4s, v3.4s
- bsl v0.16b, v5.16b, v17.16b
- eor v5.16b, v4.16b, v6.16b
- ushr v6.4s, v5.4s, #12
+ eor v5.16b, v4.16b, v5.16b
+ ushr v17.4s, v5.4s, #12
shl v5.4s, v5.4s, #20
- orr v5.16b, v5.16b, v6.16b
+ orr v5.16b, v5.16b, v17.16b
add v2.4s, v2.4s, v5.4s
eor v3.16b, v2.16b, v3.16b
- ushr v6.4s, v3.4s, #8
+ ushr v17.4s, v3.4s, #8
shl v3.4s, v3.4s, #24
- orr v3.16b, v3.16b, v6.16b
+ orr v3.16b, v3.16b, v17.16b
+ ext v17.16b, v18.16b, v1.16b, #8
add v4.4s, v3.4s, v4.4s
- uzp2 v18.4s, v26.4s, v18.4s
+ uzp2 v17.4s, v17.4s, v0.4s
+ ext v3.16b, v3.16b, v3.16b, #8
eor v5.16b, v4.16b, v5.16b
- add v2.4s, v2.4s, v18.4s
- ushr v6.4s, v5.4s, #7
+ add v2.4s, v2.4s, v17.4s
+ ext v4.16b, v4.16b, v4.16b, #12
+ ushr v18.4s, v5.4s, #7
shl v5.4s, v5.4s, #25
ext v2.16b, v2.16b, v2.16b, #4
- orr v5.16b, v5.16b, v6.16b
+ orr v5.16b, v5.16b, v18.16b
+ ext v18.16b, v1.16b, v1.16b, #12
+ add v2.4s, v2.4s, v5.4s
+ ext v1.16b, v1.16b, v18.16b, #12
+ zip1 v18.2d, v16.2d, v0.2d
+ zip2 v0.4s, v0.4s, v16.4s
+ eor v3.16b, v3.16b, v2.16b
+ rev64 v1.4s, v1.4s
+ mov v18.s[3], v6.s[3]
+ zip1 v16.4s, v0.4s, v6.4s
+ rev32 v3.8h, v3.8h
+ trn2 v1.4s, v1.4s, v7.4s
+ zip1 v0.4s, v6.4s, v0.4s
+ add v4.4s, v4.4s, v3.4s
+ add v2.4s, v2.4s, v1.4s
+ ext v6.16b, v0.16b, v16.16b, #8
+ eor v5.16b, v4.16b, v5.16b
+ ushr v7.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v5.16b, v5.16b, v7.16b
+ add v7.4s, v2.4s, v5.4s
+ eor v2.16b, v7.16b, v3.16b
+ ext v7.16b, v7.16b, v7.16b, #12
+ ushr v3.4s, v2.4s, #8
+ shl v2.4s, v2.4s, #24
+ orr v3.16b, v2.16b, v3.16b
+ ext v2.16b, v18.16b, v18.16b, #12
+ add v4.4s, v3.4s, v4.4s
+ uzp1 v2.4s, v18.4s, v2.4s
ext v3.16b, v3.16b, v3.16b, #8
- add v2.4s, v5.4s, v2.4s
- eor v3.16b, v2.16b, v3.16b
+ eor v5.16b, v4.16b, v5.16b
+ add v7.4s, v7.4s, v2.4s
+ ext v4.16b, v4.16b, v4.16b, #4
+ ushr v18.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ orr v5.16b, v5.16b, v18.16b
+ add v7.4s, v7.4s, v5.4s
+ eor v3.16b, v3.16b, v7.16b
+ add v7.4s, v7.4s, v6.4s
+ rev32 v3.8h, v3.8h
+ add v4.4s, v4.4s, v3.4s
+ eor v5.16b, v4.16b, v5.16b
+ ushr v0.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v0.16b, v5.16b, v0.16b
+ add v5.4s, v7.4s, v0.4s
+ ext v7.16b, v17.16b, v17.16b, #4
+ eor v3.16b, v5.16b, v3.16b
+ uzp1 v17.4s, v7.4s, v7.4s
+ ushr v16.4s, v3.4s, #8
+ shl v3.4s, v3.4s, #24
+ orr v3.16b, v3.16b, v16.16b
+ ext v16.16b, v17.16b, v7.16b, #8
+ add v4.4s, v3.4s, v4.4s
+ uzp2 v16.4s, v16.4s, v1.4s
+ ext v3.16b, v3.16b, v3.16b, #8
+ eor v0.16b, v4.16b, v0.16b
+ add v5.4s, v5.4s, v16.4s
+ ext v4.16b, v4.16b, v4.16b, #12
+ ushr v17.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ ext v5.16b, v5.16b, v5.16b, #4
+ orr v0.16b, v0.16b, v17.16b
+ ext v17.16b, v7.16b, v7.16b, #12
+ add v5.4s, v5.4s, v0.4s
+ ext v7.16b, v7.16b, v17.16b, #12
+ mov v17.16b, v6.16b
+ eor v3.16b, v3.16b, v5.16b
+ rev64 v7.4s, v7.4s
+ mov v17.s[1], v2.s[2]
+ rev32 v3.8h, v3.8h
+ add v4.4s, v4.4s, v3.4s
+ eor v18.16b, v4.16b, v0.16b
+ trn2 v0.4s, v7.4s, v17.4s
+ ushr v7.4s, v18.4s, #12
+ shl v17.4s, v18.4s, #20
+ add v5.4s, v5.4s, v0.4s
+ zip1 v18.2d, v6.2d, v1.2d
+ zip2 v1.4s, v1.4s, v6.4s
+ orr v7.16b, v17.16b, v7.16b
+ mov v18.s[3], v2.s[3]
+ zip1 v6.4s, v1.4s, v2.4s
+ add v5.4s, v5.4s, v7.4s
+ zip1 v1.4s, v2.4s, v1.4s
+ eor v3.16b, v5.16b, v3.16b
+ ext v5.16b, v5.16b, v5.16b, #12
+ ext v6.16b, v1.16b, v6.16b, #8
+ ushr v17.4s, v3.4s, #8
+ shl v3.4s, v3.4s, #24
+ orr v17.16b, v3.16b, v17.16b
+ ext v3.16b, v18.16b, v18.16b, #12
+ add v4.4s, v17.4s, v4.4s
+ uzp1 v3.4s, v18.4s, v3.4s
+ ext v17.16b, v17.16b, v17.16b, #8
+ eor v7.16b, v4.16b, v7.16b
+ add v5.4s, v5.4s, v3.4s
+ ext v4.16b, v4.16b, v4.16b, #4
+ ushr v18.4s, v7.4s, #7
+ shl v7.4s, v7.4s, #25
+ orr v7.16b, v7.16b, v18.16b
+ add v5.4s, v5.4s, v7.4s
+ eor v17.16b, v17.16b, v5.16b
+ add v5.4s, v5.4s, v6.4s
+ rev32 v17.8h, v17.8h
+ add v4.4s, v4.4s, v17.4s
+ eor v2.16b, v4.16b, v7.16b
+ ext v7.16b, v16.16b, v16.16b, #4
+ ushr v1.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ orr v1.16b, v2.16b, v1.16b
+ add v2.4s, v5.4s, v1.4s
+ eor v5.16b, v2.16b, v17.16b
+ uzp1 v17.4s, v7.4s, v7.4s
+ ushr v16.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ orr v5.16b, v5.16b, v16.16b
+ ext v16.16b, v17.16b, v7.16b, #8
+ add v4.4s, v5.4s, v4.4s
+ uzp2 v16.4s, v16.4s, v0.4s
+ ext v5.16b, v5.16b, v5.16b, #8
+ eor v1.16b, v4.16b, v1.16b
+ add v2.4s, v2.4s, v16.4s
ext v4.16b, v4.16b, v4.16b, #12
- add v0.4s, v2.4s, v0.4s
- rev32 v2.8h, v3.8h
- add v3.4s, v4.4s, v2.4s
- eor v4.16b, v3.16b, v5.16b
- ushr v5.4s, v4.4s, #12
+ ushr v17.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ ext v2.16b, v2.16b, v2.16b, #4
+ orr v1.16b, v1.16b, v17.16b
+ ext v17.16b, v7.16b, v7.16b, #12
+ add v2.4s, v2.4s, v1.4s
+ ext v7.16b, v7.16b, v17.16b, #12
+ mov v17.16b, v6.16b
+ eor v5.16b, v5.16b, v2.16b
+ rev64 v7.4s, v7.4s
+ mov v17.s[1], v3.s[2]
+ rev32 v5.8h, v5.8h
+ add v4.4s, v4.4s, v5.4s
+ eor v18.16b, v4.16b, v1.16b
+ trn2 v1.4s, v7.4s, v17.4s
+ ushr v7.4s, v18.4s, #12
+ shl v17.4s, v18.4s, #20
+ add v2.4s, v2.4s, v1.4s
+ zip1 v18.2d, v6.2d, v0.2d
+ zip2 v0.4s, v0.4s, v6.4s
+ orr v7.16b, v17.16b, v7.16b
+ mov v18.s[3], v3.s[3]
+ add v2.4s, v2.4s, v7.4s
+ eor v5.16b, v2.16b, v5.16b
+ ext v2.16b, v2.16b, v2.16b, #12
+ ushr v17.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ orr v5.16b, v5.16b, v17.16b
+ add v17.4s, v5.4s, v4.4s
+ ext v4.16b, v18.16b, v18.16b, #12
+ ext v5.16b, v5.16b, v5.16b, #8
+ eor v7.16b, v17.16b, v7.16b
+ uzp1 v4.4s, v18.4s, v4.4s
+ ext v17.16b, v17.16b, v17.16b, #4
+ ushr v18.4s, v7.4s, #7
+ shl v7.4s, v7.4s, #25
+ add v2.4s, v2.4s, v4.4s
+ orr v7.16b, v7.16b, v18.16b
+ add v2.4s, v2.4s, v7.4s
+ eor v5.16b, v5.16b, v2.16b
+ rev32 v5.8h, v5.8h
+ add v6.4s, v17.4s, v5.4s
+ zip1 v17.4s, v0.4s, v3.4s
+ zip1 v0.4s, v3.4s, v0.4s
+ eor v3.16b, v6.16b, v7.16b
+ ext v0.16b, v0.16b, v17.16b, #8
+ ushr v7.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ add v2.4s, v2.4s, v0.4s
+ orr v3.16b, v3.16b, v7.16b
+ ext v7.16b, v16.16b, v16.16b, #4
+ add v2.4s, v2.4s, v3.4s
+ uzp1 v17.4s, v7.4s, v7.4s
+ eor v5.16b, v2.16b, v5.16b
+ ushr v16.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ orr v5.16b, v5.16b, v16.16b
+ ext v16.16b, v17.16b, v7.16b, #8
+ add v6.4s, v5.4s, v6.4s
+ uzp2 v16.4s, v16.4s, v1.4s
+ ext v5.16b, v5.16b, v5.16b, #8
+ eor v3.16b, v6.16b, v3.16b
+ add v2.4s, v2.4s, v16.4s
+ ext v6.16b, v6.16b, v6.16b, #12
+ ushr v17.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ ext v2.16b, v2.16b, v2.16b, #4
+ orr v3.16b, v3.16b, v17.16b
+ add v17.4s, v2.4s, v3.4s
+ eor v2.16b, v5.16b, v17.16b
+ ext v5.16b, v7.16b, v7.16b, #12
+ rev32 v18.8h, v2.8h
+ ext v2.16b, v7.16b, v5.16b, #12
+ mov v5.16b, v0.16b
+ add v6.4s, v6.4s, v18.4s
+ rev64 v2.4s, v2.4s
+ mov v5.s[1], v4.s[2]
+ eor v3.16b, v6.16b, v3.16b
+ trn2 v2.4s, v2.4s, v5.4s
+ ushr v5.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ add v7.4s, v17.4s, v2.4s
+ orr v3.16b, v3.16b, v5.16b
+ add v5.4s, v7.4s, v3.4s
+ eor v7.16b, v5.16b, v18.16b
+ zip1 v18.2d, v0.2d, v1.2d
+ ext v5.16b, v5.16b, v5.16b, #12
+ zip2 v0.4s, v1.4s, v0.4s
+ ushr v17.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ mov v18.s[3], v4.s[3]
+ orr v7.16b, v7.16b, v17.16b
+ ext v17.16b, v18.16b, v18.16b, #12
+ add v6.4s, v7.4s, v6.4s
+ ext v7.16b, v7.16b, v7.16b, #8
+ eor v19.16b, v6.16b, v3.16b
+ uzp1 v3.4s, v18.4s, v17.4s
+ ext v6.16b, v6.16b, v6.16b, #4
+ ushr v17.4s, v19.4s, #7
+ shl v18.4s, v19.4s, #25
+ add v5.4s, v5.4s, v3.4s
+ orr v17.16b, v18.16b, v17.16b
+ add v5.4s, v5.4s, v17.4s
+ eor v7.16b, v7.16b, v5.16b
+ rev32 v7.8h, v7.8h
+ add v1.4s, v6.4s, v7.4s
+ zip1 v6.4s, v0.4s, v4.4s
+ zip1 v0.4s, v4.4s, v0.4s
+ eor v4.16b, v1.16b, v17.16b
+ ext v6.16b, v0.16b, v6.16b, #8
+ ushr v0.4s, v4.4s, #12
shl v4.4s, v4.4s, #20
- orr v4.16b, v4.16b, v5.16b
- add v0.4s, v0.4s, v4.4s
- eor v2.16b, v0.16b, v2.16b
- ushr v5.4s, v2.4s, #8
- shl v2.4s, v2.4s, #24
- orr v2.16b, v2.16b, v5.16b
- add v3.4s, v2.4s, v3.4s
- eor v4.16b, v3.16b, v4.16b
- ext v0.16b, v0.16b, v0.16b, #12
- ushr v5.4s, v4.4s, #7
- shl v4.4s, v4.4s, #25
- add v0.4s, v0.4s, v21.4s
- orr v4.16b, v4.16b, v5.16b
- ext v2.16b, v2.16b, v2.16b, #8
- add v0.4s, v0.4s, v4.4s
- eor v2.16b, v0.16b, v2.16b
- ext v3.16b, v3.16b, v3.16b, #4
- add v0.4s, v0.4s, v1.4s
- rev32 v1.8h, v2.8h
- add v2.4s, v3.4s, v1.4s
- eor v3.16b, v2.16b, v4.16b
- ushr v4.4s, v3.4s, #12
+ add v5.4s, v5.4s, v6.4s
+ zip1 v20.2d, v6.2d, v2.2d
+ orr v0.16b, v4.16b, v0.16b
+ mov v20.s[3], v3.s[3]
+ add v4.4s, v5.4s, v0.4s
+ eor v5.16b, v4.16b, v7.16b
+ ext v7.16b, v16.16b, v16.16b, #4
+ ushr v16.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ uzp1 v17.4s, v7.4s, v7.4s
+ orr v5.16b, v5.16b, v16.16b
+ ext v16.16b, v17.16b, v7.16b, #8
+ add v1.4s, v5.4s, v1.4s
+ uzp2 v16.4s, v16.4s, v2.4s
+ zip2 v2.4s, v2.4s, v6.4s
+ eor v0.16b, v1.16b, v0.16b
+ add v4.4s, v4.4s, v16.4s
+ ext v1.16b, v1.16b, v1.16b, #12
+ ext v16.16b, v16.16b, v16.16b, #4
+ ushr v17.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ ext v4.16b, v4.16b, v4.16b, #4
+ orr v17.16b, v0.16b, v17.16b
+ ext v0.16b, v5.16b, v5.16b, #8
+ ext v5.16b, v7.16b, v7.16b, #12
+ add v4.4s, v4.4s, v17.4s
+ eor v0.16b, v0.16b, v4.16b
+ rev32 v18.8h, v0.8h
+ ext v0.16b, v7.16b, v5.16b, #12
+ mov v5.16b, v6.16b
+ add v7.4s, v1.4s, v18.4s
+ rev64 v1.4s, v0.4s
+ mov v5.s[1], v3.s[2]
+ eor v17.16b, v7.16b, v17.16b
+ trn2 v1.4s, v1.4s, v5.4s
+ ushr v19.4s, v17.4s, #12
+ shl v17.4s, v17.4s, #20
+ add v4.4s, v4.4s, v1.4s
+ orr v17.16b, v17.16b, v19.16b
+ add v19.4s, v4.4s, v17.4s
+ eor v4.16b, v19.16b, v18.16b
+ ext v19.16b, v19.16b, v19.16b, #12
+ ushr v18.4s, v4.4s, #8
+ shl v4.4s, v4.4s, #24
+ orr v18.16b, v4.16b, v18.16b
+ ext v4.16b, v20.16b, v20.16b, #12
+ add v7.4s, v18.4s, v7.4s
+ uzp1 v4.4s, v20.4s, v4.4s
+ ext v18.16b, v18.16b, v18.16b, #8
+ eor v17.16b, v7.16b, v17.16b
+ add v19.4s, v19.4s, v4.4s
+ ext v7.16b, v7.16b, v7.16b, #4
+ ushr v20.4s, v17.4s, #7
+ shl v17.4s, v17.4s, #25
+ orr v17.16b, v17.16b, v20.16b
+ add v19.4s, v19.4s, v17.4s
+ eor v18.16b, v18.16b, v19.16b
+ rev32 v18.8h, v18.8h
+ add v6.4s, v7.4s, v18.4s
+ zip1 v7.4s, v2.4s, v3.4s
+ zip1 v2.4s, v3.4s, v2.4s
+ eor v3.16b, v6.16b, v17.16b
+ ext v2.16b, v2.16b, v7.16b, #8
+ ushr v7.4s, v3.4s, #12
shl v3.4s, v3.4s, #20
- orr v3.16b, v3.16b, v4.16b
- add v0.4s, v0.4s, v3.4s
- eor v1.16b, v0.16b, v1.16b
- ushr v4.4s, v1.4s, #8
- shl v1.4s, v1.4s, #24
- orr v1.16b, v1.16b, v4.16b
- add v2.4s, v1.4s, v2.4s
- eor v3.16b, v2.16b, v3.16b
- ushr v4.4s, v3.4s, #7
+ add v17.4s, v19.4s, v2.4s
+ zip1 v1.2d, v2.2d, v1.2d
+ zip2 v0.4s, v0.4s, v2.4s
+ orr v3.16b, v3.16b, v7.16b
+ mov v1.s[3], v4.s[3]
+ add v7.4s, v17.4s, v3.4s
+ eor v17.16b, v7.16b, v18.16b
+ ext v7.16b, v7.16b, v7.16b, #4
+ ushr v18.4s, v17.4s, #8
+ shl v17.4s, v17.4s, #24
+ orr v17.16b, v17.16b, v18.16b
+ ext v18.16b, v16.16b, v16.16b, #8
+ add v6.4s, v17.4s, v6.4s
+ uzp2 v5.4s, v18.4s, v5.4s
+ eor v3.16b, v6.16b, v3.16b
+ ext v5.16b, v5.16b, v18.16b, #4
+ ext v6.16b, v6.16b, v6.16b, #12
+ ushr v18.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ add v5.4s, v7.4s, v5.4s
+ ext v7.16b, v17.16b, v17.16b, #8
+ ext v17.16b, v16.16b, v16.16b, #12
+ orr v3.16b, v3.16b, v18.16b
+ ext v16.16b, v16.16b, v17.16b, #12
+ add v5.4s, v3.4s, v5.4s
+ mov v17.16b, v2.16b
+ rev64 v16.4s, v16.4s
+ eor v7.16b, v7.16b, v5.16b
+ mov v17.s[1], v4.s[2]
+ rev32 v7.8h, v7.8h
+ trn2 v16.4s, v16.4s, v17.4s
+ add v6.4s, v6.4s, v7.4s
+ add v5.4s, v5.4s, v16.4s
+ eor v3.16b, v6.16b, v3.16b
+ ushr v17.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ orr v3.16b, v3.16b, v17.16b
+ add v5.4s, v5.4s, v3.4s
+ eor v7.16b, v5.16b, v7.16b
+ ext v5.16b, v5.16b, v5.16b, #12
+ ushr v16.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ orr v7.16b, v7.16b, v16.16b
+ ext v16.16b, v1.16b, v1.16b, #12
+ add v6.4s, v7.4s, v6.4s
+ uzp1 v1.4s, v1.4s, v16.4s
+ eor v3.16b, v6.16b, v3.16b
+ add v1.4s, v5.4s, v1.4s
+ ext v5.16b, v7.16b, v7.16b, #8
+ ext v6.16b, v6.16b, v6.16b, #4
+ ushr v16.4s, v3.4s, #7
shl v3.4s, v3.4s, #25
+ orr v3.16b, v3.16b, v16.16b
+ add v1.4s, v1.4s, v3.4s
+ eor v5.16b, v5.16b, v1.16b
+ rev32 v5.8h, v5.8h
+ add v2.4s, v6.4s, v5.4s
+ zip1 v6.4s, v0.4s, v4.4s
+ zip1 v0.4s, v4.4s, v0.4s
+ eor v3.16b, v2.16b, v3.16b
+ ext v0.16b, v0.16b, v6.16b, #8
+ ushr v4.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ add v0.4s, v1.4s, v0.4s
+ orr v1.16b, v3.16b, v4.16b
+ add v0.4s, v0.4s, v1.4s
+ eor v3.16b, v0.16b, v5.16b
ext v0.16b, v0.16b, v0.16b, #4
- ext v1.16b, v1.16b, v1.16b, #8
- ext v2.16b, v2.16b, v2.16b, #12
+ ushr v4.4s, v3.4s, #8
+ shl v3.4s, v3.4s, #24
orr v3.16b, v3.16b, v4.16b
+ add v2.4s, v3.4s, v2.4s
+ ext v3.16b, v3.16b, v3.16b, #8
+ eor v1.16b, v2.16b, v1.16b
+ ext v2.16b, v2.16b, v2.16b, #12
+ ushr v4.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ stp q2, q3, [x0, #32]
+ orr v1.16b, v1.16b, v4.16b
+ stp q0, q1, [x0]
+ ret
+.Lfunc_end1:
+ .size compress_pre, .Lfunc_end1-compress_pre
+ .cfi_endproc
+
+ .globl zfs_blake3_compress_xof_sse2
+ .p2align 2
+ .type zfs_blake3_compress_xof_sse2,@function
+zfs_blake3_compress_xof_sse2: /* wrapper: runs compress_pre into a 64-byte stack buffer, then writes 64 bytes through the pointer passed in x5 */
+ .cfi_startproc
+ hint #25 /* HINT 25 is the PACIASP encoding: signs the return address in x30 */
+ .cfi_negate_ra_state
+ sub sp, sp, #96 /* 96-byte frame: bytes 0..63 are scratch for compress_pre, saved registers sit above */
+ stp x29, x30, [sp, #64]
+ add x29, sp, #64
+ stp x20, x19, [sp, #80] /* x19 and x20 are saved because they carry x5 and x0 across the call */
+ .cfi_def_cfa w29, 32
+ .cfi_offset w19, -8
+ .cfi_offset w20, -16
+ .cfi_offset w30, -24
+ .cfi_offset w29, -32
+ mov x20, x0 /* x20 = first argument, read again after the call */
+ mov x19, x5 /* x19 = sixth argument, the output pointer */
+ mov w5, w4 /* the next four moves shift the original x1..w4 up one argument slot */
+ mov x4, x3
+ mov w3, w2
+ mov x2, x1
+ mov x0, sp /* compress_pre argument 0 = scratch buffer at sp */
+ mov x1, x20 /* compress_pre argument 1 = original first argument */
+ bl compress_pre
+ ldp q0, q1, [sp] /* q0, q1 = scratch bytes 0..31 */
+ ldp q2, q3, [sp, #32] /* q2, q3 = scratch bytes 32..63 */
eor v0.16b, v2.16b, v0.16b
- eor v3.16b, v3.16b, v1.16b
- stp q0, q3, [x5]
- ldr q0, [x0]
+ eor v1.16b, v3.16b, v1.16b /* v1 = scratch[16..31] xor scratch[48..63]; q3 stays intact for the last store */
+ ldp x29, x30, [sp, #64] /* restore frame pointer and return address */
+ stp q0, q1, [x19] /* out[0..31] = scratch[0..31] xor scratch[32..63] */
+ ldr q0, [x20] /* bytes 0..15 behind the original first argument */
eor v0.16b, v0.16b, v2.16b
- str q0, [x5, #32]
- ldr q0, [x0, #16]
- eor v0.16b, v0.16b, v1.16b
- str q0, [x5, #48]
+ str q0, [x19, #32] /* out[32..47] = arg0[0..15] xor scratch[32..47] */
+ ldr q0, [x20, #16] /* bytes 16..31 behind the original first argument */
+ eor v0.16b, v0.16b, v3.16b
+ str q0, [x19, #48] /* out[48..63] = arg0[16..31] xor scratch[48..63] */
+ ldp x20, x19, [sp, #80]
+ add sp, sp, #96
+ hint #29 /* HINT 29 is the AUTIASP encoding: authenticates x30 before the return */
ret
-.Lfunc_end1:
- .size zfs_blake3_compress_xof_sse2, .Lfunc_end1-zfs_blake3_compress_xof_sse2
+.Lfunc_end2:
+ .size zfs_blake3_compress_xof_sse2, .Lfunc_end2-zfs_blake3_compress_xof_sse2
.cfi_endproc
.section .rodata.cst16,"aM",@progbits,16
.p2align 4
-.LCPI2_0:
+.LCPI3_0: /* vector constant that zfs_blake3_hash_many_sse2 loads into q0 and ANDs with a mask broadcast from bit 0 of w5 */
.word 0
.word 1
.word 2
.type zfs_blake3_hash_many_sse2,@function
zfs_blake3_hash_many_sse2:
.cfi_startproc
+ hint #25
+ .cfi_negate_ra_state
stp d15, d14, [sp, #-160]!
stp d13, d12, [sp, #16]
stp d11, d10, [sp, #32]
stp d9, d8, [sp, #48]
stp x29, x30, [sp, #64]
+ add x29, sp, #64
stp x28, x27, [sp, #80]
stp x26, x25, [sp, #96]
stp x24, x23, [sp, #112]
stp x22, x21, [sp, #128]
stp x20, x19, [sp, #144]
- mov x29, sp
- sub sp, sp, #384
- .cfi_def_cfa w29, 160
+ sub sp, sp, #464
+ .cfi_def_cfa w29, 96
.cfi_offset w19, -8
.cfi_offset w20, -16
.cfi_offset w21, -24
.cfi_offset b13, -144
.cfi_offset b14, -152
.cfi_offset b15, -160
- ldr x26, [x29, #168]
- ldrb w27, [x29, #160]
mov w19, w6
mov x20, x4
- mov x22, x2
- mov x28, x1
+ mov x24, x1
+ ldr x26, [x29, #104]
+ ldrb w27, [x29, #96]
cmp x1, #4
- mov x24, x0
str x3, [sp, #40]
- b.lo .LBB2_8
- adrp x9, .LCPI2_0
- ldr q0, [x9, :lo12:.LCPI2_0]
- sbfx w11, w5, #0, #1
- dup v1.4s, w11
- mov w9, #58983
+ b.lo .LBB3_6
+ adrp x8, .LCPI3_0
+ sbfx w9, w5, #0, #1
mov w10, #44677
- and v0.16b, v1.16b, v0.16b
mov w11, #62322
- mov w12, #62778
- orr w8, w7, w19
- movk w9, #27145, lsl #16
movk w10, #47975, lsl #16
movk w11, #15470, lsl #16
+ ldr q0, [x8, :lo12:.LCPI3_0]
+ dup v1.4s, w9
+ mov w9, #58983
+ orr w8, w7, w19
+ movk w9, #27145, lsl #16
+ and v0.16b, v1.16b, v0.16b
+ dup v1.4s, w11
+ movi v24.4s, #64
+ dup v2.4s, w9
+ mov w9, #62778
+ movk w9, #42319, lsl #16
str q0, [sp, #16]
orr v0.4s, #128, lsl #24
- movk w12, #42319, lsl #16
+ stp q2, q1, [sp, #48]
str q0, [sp]
-.LBB2_2:
- ldr x0, [sp, #40]
- mov x13, x0
- ld1r { v20.4s }, [x13], #4
- add x14, x0, #8
- add x15, x0, #12
- add x16, x0, #16
- add x17, x0, #20
- add x18, x0, #24
- add x0, x0, #28
- ld1r { v17.4s }, [x14]
- ld1r { v6.4s }, [x15]
- ld1r { v8.4s }, [x16]
- ld1r { v9.4s }, [x17]
- ld1r { v31.4s }, [x18]
- ld1r { v26.4s }, [x13]
- ld1r { v15.4s }, [x0]
- cbz x22, .LBB2_7
+ dup v0.4s, w10
+ str q0, [sp, #80]
+ b .LBB3_3
+.LBB3_2:
+ zip1 v0.4s, v12.4s, v31.4s
+ add x10, x20, #4
+ zip1 v1.4s, v29.4s, v30.4s
+ tst w5, #0x1
+ zip1 v2.4s, v28.4s, v23.4s
+ csel x20, x10, x20, ne
+ zip1 v3.4s, v13.4s, v25.4s
+ add x0, x0, #32
+ zip2 v6.4s, v12.4s, v31.4s
+ sub x24, x24, #4
+ zip1 v4.2d, v0.2d, v1.2d
+ cmp x24, #3
+ zip2 v7.4s, v29.4s, v30.4s
+ zip1 v5.2d, v2.2d, v3.2d
+ zip2 v0.2d, v0.2d, v1.2d
+ zip2 v1.2d, v2.2d, v3.2d
+ zip2 v2.4s, v28.4s, v23.4s
+ zip2 v3.4s, v13.4s, v25.4s
+ stp q4, q5, [x26]
+ zip2 v4.2d, v6.2d, v7.2d
+ stp q0, q1, [x26, #32]
+ zip1 v0.2d, v6.2d, v7.2d
+ zip1 v1.2d, v2.2d, v3.2d
+ zip2 v2.2d, v2.2d, v3.2d
+ stp q0, q1, [x26, #64]
+ stp q4, q2, [x26, #96]
+ add x26, x26, #128
+ b.ls .LBB3_6
+.LBB3_3:
+ ldr x14, [sp, #40]
+ mov x10, x14
+ add x11, x14, #8
+ add x12, x14, #12
+ add x13, x14, #16
+ ld1r { v12.4s }, [x10], #4
+ ld1r { v29.4s }, [x11]
+ add x11, x14, #20
+ ld1r { v30.4s }, [x12]
+ add x12, x14, #24
+ ld1r { v28.4s }, [x13]
+ ld1r { v23.4s }, [x11]
+ add x11, x14, #28
+ ld1r { v13.4s }, [x12]
+ ld1r { v31.4s }, [x10]
+ ld1r { v25.4s }, [x11]
+ cbz x2, .LBB3_2
ldr q1, [sp, #16]
dup v0.4s, w20
- ldp x13, x14, [x24]
- ldp x15, x16, [x24, #16]
+ lsr x12, x20, #32
+ mov x10, xzr
+ ldp x13, x14, [x0, #16]
add v1.4s, v0.4s, v1.4s
+ mov x15, x2
movi v0.4s, #128, lsl #24
- str q1, [sp, #64]
+ mov w4, w8
+ str q1, [sp, #112]
eor v0.16b, v1.16b, v0.16b
ldr q1, [sp]
- lsr x18, x20, #32
- mov x17, xzr
cmgt v0.4s, v1.4s, v0.4s
- dup v1.4s, w18
+ dup v1.4s, w12
+ ldp x11, x12, [x0]
sub v0.4s, v1.4s, v0.4s
- mov w18, w8
- str q0, [sp, #48]
-.LBB2_4:
- mov w2, #16
- bfi x2, x17, #6, #58
- ldr q1, [x13, x2]
- ldr q3, [x14, x2]
- ldr q2, [x15, x2]
- ldr q4, [x16, x2]
- mov w2, #32
- bfi x2, x17, #6, #58
- ldr q5, [x13, x2]
- ldr q18, [x14, x2]
- ldr q19, [x15, x2]
- ldr q23, [x16, x2]
- mov w2, #48
- lsl x3, x17, #6
- bfi x2, x17, #6, #58
- add x17, x17, #1
- ldr q0, [x13, x3]
- ldr q21, [x14, x3]
- ldr q7, [x15, x3]
- ldr q16, [x16, x3]
- cmp x17, x22
- ldr q13, [x13, x2]
- ldr q14, [x14, x2]
- ldr q29, [x15, x2]
- ldr q10, [x16, x2]
- csel w2, w27, wzr, eq
- orr w18, w2, w18
- mov x0, xzr
- and w18, w18, #0xff
- add x3, x3, #256
-.LBB2_5:
- ldr x2, [x24, x0]
- add x0, x0, #8
- cmp x0, #32
- add x2, x2, x3
- prfm pldl1keep, [x2]
- b.ne .LBB2_5
- dup v22.4s, w18
- str q22, [sp, #192]
- zip1 v27.4s, v0.4s, v21.4s
- zip2 v21.4s, v0.4s, v21.4s
- zip1 v0.4s, v7.4s, v16.4s
- zip2 v22.4s, v7.4s, v16.4s
- zip1 v7.4s, v1.4s, v3.4s
- zip1 v25.4s, v2.4s, v4.4s
- zip2 v16.4s, v2.4s, v4.4s
- zip1 v11.4s, v19.4s, v23.4s
- zip2 v12.4s, v19.4s, v23.4s
- zip1 v19.4s, v13.4s, v14.4s
- zip2 v23.4s, v13.4s, v14.4s
- zip1 v13.4s, v29.4s, v10.4s
- zip2 v14.4s, v29.4s, v10.4s
- add v10.4s, v20.4s, v8.4s
- add v2.4s, v26.4s, v9.4s
- ext v20.16b, v22.16b, v21.16b, #8
- ext v26.16b, v25.16b, v7.16b, #8
- zip2 v24.4s, v1.4s, v3.4s
- add v1.4s, v6.4s, v15.4s
- ext v6.16b, v0.16b, v27.16b, #8
- ext v20.16b, v21.16b, v20.16b, #8
- mov v21.d[1], v22.d[0]
- ext v22.16b, v7.16b, v26.16b, #8
- mov v7.d[1], v25.d[0]
- add v3.4s, v17.4s, v31.4s
- str q1, [sp, #144]
- ext v1.16b, v27.16b, v6.16b, #8
- mov v6.16b, v7.16b
- zip1 v28.4s, v5.4s, v18.4s
- stur q1, [x29, #-80]
- mov v1.16b, v27.16b
- mov v27.16b, v24.16b
- add v3.4s, v3.4s, v6.4s
- ldr q6, [sp, #64]
- ext v29.16b, v16.16b, v24.16b, #8
- mov v1.d[1], v0.d[0]
- ext v0.16b, v11.16b, v28.16b, #8
- mov v27.d[1], v16.d[0]
- ext v16.16b, v14.16b, v23.16b, #8
- stur q7, [x29, #-144]
- ext v7.16b, v24.16b, v29.16b, #8
- ext v29.16b, v28.16b, v0.16b, #8
- ext v0.16b, v23.16b, v16.16b, #8
- mov v23.d[1], v14.d[0]
- stp q0, q23, [sp, #80]
- add v0.4s, v10.4s, v1.4s
- eor v16.16b, v0.16b, v6.16b
- ldr q6, [sp, #48]
- add v2.4s, v2.4s, v21.4s
- mov v28.d[1], v11.d[0]
- zip2 v18.4s, v5.4s, v18.4s
- eor v10.16b, v2.16b, v6.16b
- movi v6.4s, #64
- eor v11.16b, v3.16b, v6.16b
- ldr q6, [sp, #144]
- dup v17.4s, w9
- ext v30.16b, v12.16b, v18.16b, #8
- rev32 v16.8h, v16.8h
- dup v5.4s, w10
- ext v25.16b, v18.16b, v30.16b, #8
- mov v30.16b, v23.16b
- mov v23.16b, v1.16b
- str q1, [sp, #160]
- rev32 v10.8h, v10.8h
- add v1.4s, v16.4s, v17.4s
- add v17.4s, v6.4s, v27.4s
- ldr q6, [sp, #192]
- dup v4.4s, w11
- rev32 v11.8h, v11.8h
- add v5.4s, v10.4s, v5.4s
- eor v8.16b, v1.16b, v8.16b
- stur q21, [x29, #-128]
- mov v18.d[1], v12.d[0]
- add v4.4s, v11.4s, v4.4s
- eor v9.16b, v5.16b, v9.16b
- ushr v12.4s, v8.4s, #12
- shl v8.4s, v8.4s, #20
- ldur q21, [x29, #-80]
- ext v26.16b, v13.16b, v19.16b, #8
- eor v31.16b, v4.16b, v31.16b
- orr v8.16b, v8.16b, v12.16b
- ushr v12.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- ext v26.16b, v19.16b, v26.16b, #8
- mov v19.d[1], v13.d[0]
- orr v9.16b, v9.16b, v12.16b
- ushr v12.4s, v31.4s, #12
- shl v31.4s, v31.4s, #20
- eor v13.16b, v17.16b, v6.16b
- orr v31.16b, v31.16b, v12.16b
- dup v12.4s, w12
- rev32 v13.8h, v13.8h
- add v12.4s, v13.4s, v12.4s
- add v0.4s, v0.4s, v21.4s
- eor v14.16b, v12.16b, v15.16b
- add v0.4s, v0.4s, v8.4s
- add v2.4s, v2.4s, v20.4s
- ushr v15.4s, v14.4s, #12
- shl v14.4s, v14.4s, #20
- eor v16.16b, v0.16b, v16.16b
- add v2.4s, v2.4s, v9.4s
- add v3.4s, v3.4s, v22.4s
- orr v14.16b, v14.16b, v15.16b
- ushr v15.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- eor v10.16b, v2.16b, v10.16b
- add v3.4s, v3.4s, v31.4s
- add v17.4s, v17.4s, v7.4s
- orr v16.16b, v16.16b, v15.16b
- ushr v15.4s, v10.4s, #8
- shl v10.4s, v10.4s, #24
- eor v11.16b, v3.16b, v11.16b
- add v17.4s, v17.4s, v14.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v11.4s, #8
- shl v11.4s, v11.4s, #24
- eor v13.16b, v17.16b, v13.16b
- add v1.4s, v16.4s, v1.4s
- orr v11.16b, v11.16b, v15.16b
- ushr v15.4s, v13.4s, #8
- shl v13.4s, v13.4s, #24
- eor v8.16b, v1.16b, v8.16b
- add v5.4s, v10.4s, v5.4s
- orr v13.16b, v13.16b, v15.16b
- ushr v15.4s, v8.4s, #7
- shl v8.4s, v8.4s, #25
- eor v9.16b, v5.16b, v9.16b
- add v4.4s, v11.4s, v4.4s
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v31.16b, v4.16b, v31.16b
- add v12.4s, v13.4s, v12.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #7
- shl v31.4s, v31.4s, #25
- eor v14.16b, v12.16b, v14.16b
- add v0.4s, v0.4s, v28.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v14.4s, #7
- shl v14.4s, v14.4s, #25
- add v0.4s, v0.4s, v9.4s
- add v2.4s, v2.4s, v18.4s
- orr v14.16b, v14.16b, v15.16b
- eor v13.16b, v0.16b, v13.16b
- add v2.4s, v2.4s, v31.4s
- add v3.4s, v3.4s, v19.4s
- rev32 v13.8h, v13.8h
- eor v16.16b, v2.16b, v16.16b
- add v3.4s, v3.4s, v14.4s
- add v17.4s, v17.4s, v30.4s
- add v4.4s, v4.4s, v13.4s
- rev32 v16.8h, v16.8h
- eor v10.16b, v3.16b, v10.16b
- add v17.4s, v17.4s, v8.4s
- eor v9.16b, v4.16b, v9.16b
- add v12.4s, v12.4s, v16.4s
- rev32 v10.8h, v10.8h
- eor v11.16b, v17.16b, v11.16b
- mov v24.16b, v7.16b
- stur q7, [x29, #-112]
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v31.16b, v12.16b, v31.16b
- add v1.4s, v1.4s, v10.4s
- rev32 v11.8h, v11.8h
- mov v7.16b, v26.16b
- add v3.4s, v3.4s, v26.4s
- ldr q26, [sp, #80]
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #12
- shl v31.4s, v31.4s, #20
- eor v14.16b, v1.16b, v14.16b
- add v5.4s, v5.4s, v11.4s
- add v0.4s, v0.4s, v29.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v14.4s, #12
- shl v14.4s, v14.4s, #20
- eor v8.16b, v5.16b, v8.16b
- add v0.4s, v0.4s, v9.4s
- add v2.4s, v2.4s, v25.4s
- orr v14.16b, v14.16b, v15.16b
- ushr v15.4s, v8.4s, #12
- shl v8.4s, v8.4s, #20
- eor v13.16b, v0.16b, v13.16b
- add v2.4s, v2.4s, v31.4s
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v13.4s, #8
- shl v13.4s, v13.4s, #24
- eor v16.16b, v2.16b, v16.16b
- add v3.4s, v3.4s, v14.4s
- add v17.4s, v17.4s, v26.4s
- orr v13.16b, v13.16b, v15.16b
- ushr v15.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- eor v10.16b, v3.16b, v10.16b
- add v17.4s, v17.4s, v8.4s
- orr v16.16b, v16.16b, v15.16b
- ushr v15.4s, v10.4s, #8
- shl v10.4s, v10.4s, #24
- eor v11.16b, v17.16b, v11.16b
- add v4.4s, v13.4s, v4.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v11.4s, #8
- shl v11.4s, v11.4s, #24
- eor v9.16b, v4.16b, v9.16b
- add v12.4s, v16.4s, v12.4s
- str q22, [sp, #128]
- orr v11.16b, v11.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v31.16b, v12.16b, v31.16b
- add v1.4s, v10.4s, v1.4s
- ldur q22, [x29, #-128]
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #7
- shl v31.4s, v31.4s, #25
- eor v14.16b, v1.16b, v14.16b
- add v5.4s, v11.4s, v5.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v14.4s, #7
- shl v14.4s, v14.4s, #25
- eor v8.16b, v5.16b, v8.16b
- mov v6.16b, v18.16b
- orr v14.16b, v14.16b, v15.16b
- ushr v15.4s, v8.4s, #7
- shl v8.4s, v8.4s, #25
- ldur q18, [x29, #-144]
- orr v8.16b, v8.16b, v15.16b
- add v0.4s, v0.4s, v22.4s
- add v0.4s, v0.4s, v8.4s
- add v2.4s, v2.4s, v20.4s
- eor v16.16b, v0.16b, v16.16b
- add v2.4s, v2.4s, v9.4s
- add v3.4s, v3.4s, v24.4s
- rev32 v16.8h, v16.8h
- eor v10.16b, v2.16b, v10.16b
- add v3.4s, v3.4s, v31.4s
- add v17.4s, v17.4s, v18.4s
- add v1.4s, v1.4s, v16.4s
- rev32 v10.8h, v10.8h
- eor v11.16b, v3.16b, v11.16b
- add v17.4s, v17.4s, v14.4s
- eor v8.16b, v1.16b, v8.16b
- add v5.4s, v5.4s, v10.4s
- rev32 v11.8h, v11.8h
- eor v13.16b, v17.16b, v13.16b
- ushr v15.4s, v8.4s, #12
- shl v8.4s, v8.4s, #20
- eor v9.16b, v5.16b, v9.16b
- add v4.4s, v4.4s, v11.4s
- rev32 v13.8h, v13.8h
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v31.16b, v4.16b, v31.16b
- add v12.4s, v12.4s, v13.4s
- add v0.4s, v0.4s, v27.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #12
- shl v31.4s, v31.4s, #20
- eor v14.16b, v12.16b, v14.16b
- add v0.4s, v0.4s, v8.4s
- add v2.4s, v2.4s, v6.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v14.4s, #12
- shl v14.4s, v14.4s, #20
- eor v16.16b, v0.16b, v16.16b
- add v2.4s, v2.4s, v9.4s
- add v3.4s, v3.4s, v23.4s
- orr v14.16b, v14.16b, v15.16b
- ushr v15.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- eor v10.16b, v2.16b, v10.16b
- add v3.4s, v3.4s, v31.4s
- add v17.4s, v17.4s, v7.4s
- orr v16.16b, v16.16b, v15.16b
- ushr v15.4s, v10.4s, #8
- shl v10.4s, v10.4s, #24
- eor v11.16b, v3.16b, v11.16b
- add v17.4s, v17.4s, v14.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v11.4s, #8
- shl v11.4s, v11.4s, #24
- eor v13.16b, v17.16b, v13.16b
- add v1.4s, v16.4s, v1.4s
- orr v11.16b, v11.16b, v15.16b
- ushr v15.4s, v13.4s, #8
- shl v13.4s, v13.4s, #24
- eor v8.16b, v1.16b, v8.16b
- add v5.4s, v10.4s, v5.4s
- orr v13.16b, v13.16b, v15.16b
- ushr v15.4s, v8.4s, #7
- shl v8.4s, v8.4s, #25
- eor v9.16b, v5.16b, v9.16b
- add v4.4s, v11.4s, v4.4s
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v31.16b, v4.16b, v31.16b
- add v12.4s, v13.4s, v12.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #7
- shl v31.4s, v31.4s, #25
- eor v14.16b, v12.16b, v14.16b
- add v0.4s, v0.4s, v21.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v14.4s, #7
- shl v14.4s, v14.4s, #25
- add v0.4s, v0.4s, v9.4s
- add v2.4s, v2.4s, v19.4s
- orr v14.16b, v14.16b, v15.16b
- eor v13.16b, v0.16b, v13.16b
- add v2.4s, v2.4s, v31.4s
- add v3.4s, v3.4s, v29.4s
- str q28, [sp, #112]
- rev32 v13.8h, v13.8h
- eor v16.16b, v2.16b, v16.16b
- add v3.4s, v3.4s, v14.4s
- add v17.4s, v17.4s, v26.4s
- add v4.4s, v4.4s, v13.4s
- rev32 v16.8h, v16.8h
- eor v10.16b, v3.16b, v10.16b
- add v17.4s, v17.4s, v8.4s
- ldp q28, q23, [sp, #112]
- eor v9.16b, v4.16b, v9.16b
- add v12.4s, v12.4s, v16.4s
- rev32 v10.8h, v10.8h
- eor v11.16b, v17.16b, v11.16b
- ldr q21, [sp, #96]
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v31.16b, v12.16b, v31.16b
- add v1.4s, v1.4s, v10.4s
- rev32 v11.8h, v11.8h
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #12
- shl v31.4s, v31.4s, #20
- eor v14.16b, v1.16b, v14.16b
- add v5.4s, v5.4s, v11.4s
- add v0.4s, v0.4s, v25.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v14.4s, #12
- shl v14.4s, v14.4s, #20
- eor v8.16b, v5.16b, v8.16b
- add v0.4s, v0.4s, v9.4s
- add v2.4s, v2.4s, v23.4s
- orr v14.16b, v14.16b, v15.16b
- ushr v15.4s, v8.4s, #12
- shl v8.4s, v8.4s, #20
- eor v13.16b, v0.16b, v13.16b
- add v2.4s, v2.4s, v31.4s
- add v3.4s, v3.4s, v21.4s
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v13.4s, #8
- shl v13.4s, v13.4s, #24
- eor v16.16b, v2.16b, v16.16b
- add v3.4s, v3.4s, v14.4s
- add v17.4s, v17.4s, v28.4s
- orr v13.16b, v13.16b, v15.16b
- ushr v15.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- eor v10.16b, v3.16b, v10.16b
- add v17.4s, v17.4s, v8.4s
- orr v16.16b, v16.16b, v15.16b
- ushr v15.4s, v10.4s, #8
- shl v10.4s, v10.4s, #24
- eor v11.16b, v17.16b, v11.16b
- add v4.4s, v13.4s, v4.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v11.4s, #8
- shl v11.4s, v11.4s, #24
- eor v9.16b, v4.16b, v9.16b
- add v12.4s, v16.4s, v12.4s
- orr v11.16b, v11.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v31.16b, v12.16b, v31.16b
- add v1.4s, v10.4s, v1.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #7
- shl v31.4s, v31.4s, #25
- eor v14.16b, v1.16b, v14.16b
- add v5.4s, v11.4s, v5.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v14.4s, #7
- shl v14.4s, v14.4s, #25
- eor v8.16b, v5.16b, v8.16b
- mov v30.16b, v29.16b
- mov v29.16b, v25.16b
- orr v14.16b, v14.16b, v15.16b
- ushr v15.4s, v8.4s, #7
- shl v8.4s, v8.4s, #25
- ldur q25, [x29, #-112]
- orr v8.16b, v8.16b, v15.16b
- add v0.4s, v0.4s, v20.4s
- add v0.4s, v0.4s, v8.4s
- add v2.4s, v2.4s, v6.4s
- eor v16.16b, v0.16b, v16.16b
- add v2.4s, v2.4s, v9.4s
- add v3.4s, v3.4s, v7.4s
- rev32 v16.8h, v16.8h
- eor v10.16b, v2.16b, v10.16b
- add v3.4s, v3.4s, v31.4s
- add v17.4s, v17.4s, v25.4s
- add v1.4s, v1.4s, v16.4s
- rev32 v10.8h, v10.8h
- eor v11.16b, v3.16b, v11.16b
- add v17.4s, v17.4s, v14.4s
- eor v8.16b, v1.16b, v8.16b
- add v5.4s, v5.4s, v10.4s
- rev32 v11.8h, v11.8h
- eor v13.16b, v17.16b, v13.16b
- ushr v15.4s, v8.4s, #12
- shl v8.4s, v8.4s, #20
- eor v9.16b, v5.16b, v9.16b
- add v4.4s, v4.4s, v11.4s
- rev32 v13.8h, v13.8h
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v31.16b, v4.16b, v31.16b
- add v12.4s, v12.4s, v13.4s
- add v0.4s, v0.4s, v18.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #12
- shl v31.4s, v31.4s, #20
- eor v14.16b, v12.16b, v14.16b
- add v0.4s, v0.4s, v8.4s
- add v2.4s, v2.4s, v19.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v14.4s, #12
- shl v14.4s, v14.4s, #20
- eor v16.16b, v0.16b, v16.16b
- add v2.4s, v2.4s, v9.4s
- add v3.4s, v3.4s, v22.4s
- orr v14.16b, v14.16b, v15.16b
- ushr v15.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- eor v10.16b, v2.16b, v10.16b
- add v3.4s, v3.4s, v31.4s
- add v17.4s, v17.4s, v21.4s
- orr v16.16b, v16.16b, v15.16b
- ushr v15.4s, v10.4s, #8
- shl v10.4s, v10.4s, #24
- eor v11.16b, v3.16b, v11.16b
- add v17.4s, v17.4s, v14.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v11.4s, #8
- shl v11.4s, v11.4s, #24
- eor v13.16b, v17.16b, v13.16b
- add v1.4s, v16.4s, v1.4s
- orr v11.16b, v11.16b, v15.16b
- ushr v15.4s, v13.4s, #8
- shl v13.4s, v13.4s, #24
- eor v8.16b, v1.16b, v8.16b
- add v5.4s, v10.4s, v5.4s
- orr v13.16b, v13.16b, v15.16b
- ushr v15.4s, v8.4s, #7
- shl v8.4s, v8.4s, #25
- eor v9.16b, v5.16b, v9.16b
- add v4.4s, v11.4s, v4.4s
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v31.16b, v4.16b, v31.16b
- add v12.4s, v13.4s, v12.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #7
- shl v31.4s, v31.4s, #25
- eor v14.16b, v12.16b, v14.16b
+ str q0, [sp, #96]
+.LBB3_5:
+ add x17, x11, x10
+ add x21, x12, x10
+ add x16, x13, x10
+ add x6, x14, x10
+ subs x15, x15, #1
+ add x10, x10, #64
+ ldp q0, q1, [x17]
+ csel w3, w27, wzr, eq
+ orr w3, w3, w4
+ mov w4, w19
+ and w3, w3, #0xff
+ ldp q3, q6, [x21]
+ dup v2.4s, w3
+ zip1 v21.4s, v0.4s, v3.4s
+ zip2 v19.4s, v0.4s, v3.4s
+ ldp q5, q7, [x16]
+ zip1 v17.4s, v1.4s, v6.4s
+ zip2 v22.4s, v1.4s, v6.4s
+ ldp q16, q18, [x6]
+ zip1 v4.4s, v5.4s, v16.4s
+ zip2 v0.4s, v5.4s, v16.4s
+ ldp q26, q27, [x17, #32]
+ zip1 v1.4s, v7.4s, v18.4s
+ zip2 v3.4s, v7.4s, v18.4s
+ zip2 v20.2d, v19.2d, v0.2d
+ mov v19.d[1], v0.d[0]
+ dup v18.4s, w9
+ ldp q8, q9, [x21, #32]
+ stur q19, [x29, #-208]
+ zip2 v7.4s, v26.4s, v8.4s
+ zip1 v10.4s, v26.4s, v8.4s
+ ldp q11, q5, [x16, #32]
+ zip2 v26.2d, v17.2d, v1.2d
+ stp q7, q26, [sp, #192]
+ mov v17.d[1], v1.d[0]
+ add v1.4s, v23.4s, v31.4s
+ ldp q16, q6, [x6, #32]
+ stur q17, [x29, #-256]
+ add v1.4s, v1.4s, v19.4s
+ zip1 v8.4s, v11.4s, v16.4s
+ zip2 v7.4s, v11.4s, v16.4s
+ zip1 v11.4s, v27.4s, v9.4s
+ zip2 v9.4s, v27.4s, v9.4s
+ zip2 v27.2d, v21.2d, v4.2d
+ mov v21.d[1], v4.d[0]
+ str q7, [sp, #224]
+ add v4.4s, v28.4s, v12.4s
+ zip1 v15.4s, v5.4s, v6.4s
+ zip2 v14.4s, v5.4s, v6.4s
+ stur q27, [x29, #-192]
+ zip2 v16.2d, v22.2d, v3.2d
+ stp q20, q21, [x29, #-240]
+ add v0.4s, v4.4s, v21.4s
+ ldp q6, q4, [sp, #96]
+ mov v22.d[1], v3.d[0]
+ add v5.4s, v25.4s, v30.4s
+ add v3.4s, v13.4s, v29.4s
+ eor v6.16b, v1.16b, v6.16b
+ add v1.4s, v1.4s, v20.4s
+ str q22, [sp, #256]
+ eor v4.16b, v0.16b, v4.16b
+ add v5.4s, v5.4s, v22.4s
+ add v3.4s, v3.4s, v17.4s
+ ldr q17, [sp, #48]
+ rev32 v6.8h, v6.8h
+ rev32 v4.8h, v4.8h
+ eor v2.16b, v5.16b, v2.16b
+ eor v7.16b, v3.16b, v24.16b
add v0.4s, v0.4s, v27.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v14.4s, #7
- shl v14.4s, v14.4s, #25
- add v0.4s, v0.4s, v9.4s
- add v2.4s, v2.4s, v30.4s
- orr v14.16b, v14.16b, v15.16b
- eor v13.16b, v0.16b, v13.16b
- add v2.4s, v2.4s, v31.4s
- add v3.4s, v3.4s, v29.4s
- rev32 v13.8h, v13.8h
- eor v16.16b, v2.16b, v16.16b
- add v3.4s, v3.4s, v14.4s
- add v17.4s, v17.4s, v28.4s
- add v4.4s, v4.4s, v13.4s
- rev32 v16.8h, v16.8h
- eor v10.16b, v3.16b, v10.16b
- add v17.4s, v17.4s, v8.4s
- eor v9.16b, v4.16b, v9.16b
- add v12.4s, v12.4s, v16.4s
- rev32 v10.8h, v10.8h
- eor v11.16b, v17.16b, v11.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v31.16b, v12.16b, v31.16b
- add v1.4s, v1.4s, v10.4s
- rev32 v11.8h, v11.8h
- ldr q24, [sp, #160]
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #12
- shl v31.4s, v31.4s, #20
- eor v14.16b, v1.16b, v14.16b
- add v5.4s, v5.4s, v11.4s
- stur q7, [x29, #-64]
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v14.4s, #12
- shl v14.4s, v14.4s, #20
- eor v8.16b, v5.16b, v8.16b
- mov v7.16b, v26.16b
- add v3.4s, v3.4s, v26.4s
- ldur q26, [x29, #-80]
- orr v14.16b, v14.16b, v15.16b
- ushr v15.4s, v8.4s, #12
- shl v8.4s, v8.4s, #20
- add v0.4s, v0.4s, v23.4s
- orr v8.16b, v8.16b, v15.16b
- add v15.4s, v0.4s, v9.4s
- add v2.4s, v2.4s, v24.4s
- eor v0.16b, v15.16b, v13.16b
- add v2.4s, v2.4s, v31.4s
- ushr v13.4s, v0.4s, #8
- shl v0.4s, v0.4s, #24
- eor v16.16b, v2.16b, v16.16b
- add v3.4s, v3.4s, v14.4s
- add v17.4s, v17.4s, v26.4s
- orr v0.16b, v0.16b, v13.16b
- ushr v13.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- eor v10.16b, v3.16b, v10.16b
- add v17.4s, v17.4s, v8.4s
- orr v16.16b, v16.16b, v13.16b
- ushr v13.4s, v10.4s, #8
- shl v10.4s, v10.4s, #24
- eor v11.16b, v17.16b, v11.16b
- add v4.4s, v0.4s, v4.4s
- orr v10.16b, v10.16b, v13.16b
- ushr v13.4s, v11.4s, #8
- shl v11.4s, v11.4s, #24
- eor v9.16b, v4.16b, v9.16b
- add v12.4s, v16.4s, v12.4s
- orr v11.16b, v11.16b, v13.16b
- ushr v13.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v31.16b, v12.16b, v31.16b
- orr v9.16b, v9.16b, v13.16b
- ushr v13.4s, v31.4s, #7
- shl v31.4s, v31.4s, #25
- add v1.4s, v10.4s, v1.4s
- orr v31.16b, v31.16b, v13.16b
- eor v13.16b, v1.16b, v14.16b
- add v5.4s, v11.4s, v5.4s
- ushr v14.4s, v13.4s, #7
- shl v13.4s, v13.4s, #25
- eor v8.16b, v5.16b, v8.16b
- orr v13.16b, v13.16b, v14.16b
- ushr v14.4s, v8.4s, #7
- shl v8.4s, v8.4s, #25
- stur q6, [x29, #-96]
- orr v8.16b, v8.16b, v14.16b
- add v14.4s, v15.4s, v6.4s
- ldur q6, [x29, #-64]
- mov v18.16b, v19.16b
- add v14.4s, v14.4s, v8.4s
- add v2.4s, v2.4s, v18.4s
- eor v16.16b, v14.16b, v16.16b
- add v2.4s, v2.4s, v9.4s
- add v3.4s, v3.4s, v21.4s
- rev32 v16.8h, v16.8h
- eor v10.16b, v2.16b, v10.16b
- add v3.4s, v3.4s, v31.4s
- add v17.4s, v17.4s, v6.4s
- add v1.4s, v1.4s, v16.4s
- rev32 v10.8h, v10.8h
- eor v11.16b, v3.16b, v11.16b
- add v17.4s, v17.4s, v13.4s
- eor v8.16b, v1.16b, v8.16b
- add v5.4s, v5.4s, v10.4s
- rev32 v11.8h, v11.8h
- eor v0.16b, v17.16b, v0.16b
- ushr v15.4s, v8.4s, #12
- shl v8.4s, v8.4s, #20
- eor v9.16b, v5.16b, v9.16b
- add v4.4s, v4.4s, v11.4s
- rev32 v0.8h, v0.8h
+ add v21.4s, v4.4s, v17.4s
+ rev32 v31.8h, v2.8h
+ ldr q2, [sp, #80]
+ rev32 v7.8h, v7.8h
+ mov v27.16b, v16.16b
+ eor v17.16b, v21.16b, v28.16b
+ add v29.4s, v6.4s, v2.4s
+ ldr q2, [sp, #64]
+ add v24.4s, v31.4s, v18.4s
str q27, [sp, #176]
- mov v27.16b, v30.16b
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v31.16b, v4.16b, v31.16b
- add v12.4s, v12.4s, v0.4s
- add v14.4s, v14.4s, v25.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #12
- shl v31.4s, v31.4s, #20
- eor v13.16b, v12.16b, v13.16b
- add v14.4s, v14.4s, v8.4s
- add v2.4s, v2.4s, v27.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v13.4s, #12
- shl v13.4s, v13.4s, #20
- eor v16.16b, v14.16b, v16.16b
- add v2.4s, v2.4s, v9.4s
- add v3.4s, v3.4s, v20.4s
- orr v13.16b, v13.16b, v15.16b
- ushr v15.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- eor v10.16b, v2.16b, v10.16b
- add v3.4s, v3.4s, v31.4s
- add v17.4s, v17.4s, v7.4s
- orr v16.16b, v16.16b, v15.16b
- ushr v15.4s, v10.4s, #8
- shl v10.4s, v10.4s, #24
- eor v11.16b, v3.16b, v11.16b
- add v17.4s, v17.4s, v13.4s
- mov v30.16b, v23.16b
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v11.4s, #8
- shl v11.4s, v11.4s, #24
- eor v0.16b, v17.16b, v0.16b
- add v1.4s, v16.4s, v1.4s
- ldur q23, [x29, #-144]
- orr v11.16b, v11.16b, v15.16b
- ushr v15.4s, v0.4s, #8
- shl v0.4s, v0.4s, #24
- eor v8.16b, v1.16b, v8.16b
- add v5.4s, v10.4s, v5.4s
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v8.4s, #7
- shl v8.4s, v8.4s, #25
- eor v9.16b, v5.16b, v9.16b
- add v4.4s, v11.4s, v4.4s
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v31.16b, v4.16b, v31.16b
- add v12.4s, v0.4s, v12.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #7
- shl v31.4s, v31.4s, #25
- eor v13.16b, v12.16b, v13.16b
- add v14.4s, v14.4s, v23.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v13.4s, #7
- shl v13.4s, v13.4s, #25
- add v14.4s, v14.4s, v9.4s
- add v2.4s, v2.4s, v29.4s
- orr v13.16b, v13.16b, v15.16b
- eor v0.16b, v14.16b, v0.16b
- add v2.4s, v2.4s, v31.4s
- add v3.4s, v3.4s, v30.4s
- rev32 v0.8h, v0.8h
- eor v16.16b, v2.16b, v16.16b
- add v3.4s, v3.4s, v13.4s
- add v17.4s, v17.4s, v26.4s
- add v4.4s, v4.4s, v0.4s
- rev32 v16.8h, v16.8h
- eor v10.16b, v3.16b, v10.16b
- add v17.4s, v17.4s, v8.4s
- ldur q22, [x29, #-128]
- eor v9.16b, v4.16b, v9.16b
- add v12.4s, v12.4s, v16.4s
- rev32 v10.8h, v10.8h
- eor v11.16b, v17.16b, v11.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v31.16b, v12.16b, v31.16b
- add v1.4s, v1.4s, v10.4s
- rev32 v11.8h, v11.8h
- ldr q26, [sp, #176]
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #12
- shl v31.4s, v31.4s, #20
- eor v13.16b, v1.16b, v13.16b
- add v5.4s, v5.4s, v11.4s
- add v14.4s, v14.4s, v24.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v13.4s, #12
- shl v13.4s, v13.4s, #20
- eor v8.16b, v5.16b, v8.16b
- add v14.4s, v14.4s, v9.4s
- add v2.4s, v2.4s, v22.4s
- orr v13.16b, v13.16b, v15.16b
- ushr v15.4s, v8.4s, #12
- shl v8.4s, v8.4s, #20
- eor v0.16b, v14.16b, v0.16b
- add v2.4s, v2.4s, v31.4s
- add v3.4s, v3.4s, v28.4s
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v0.4s, #8
- shl v0.4s, v0.4s, #24
- eor v16.16b, v2.16b, v16.16b
- add v3.4s, v3.4s, v13.4s
+ ushr v19.4s, v17.4s, #12
+ shl v17.4s, v17.4s, #20
+ add v30.4s, v7.4s, v2.4s
+ eor v18.16b, v29.16b, v23.16b
+ orr v12.16b, v17.16b, v19.16b
+ eor v17.16b, v30.16b, v13.16b
+ eor v19.16b, v24.16b, v25.16b
+ ushr v23.4s, v18.4s, #12
+ shl v18.4s, v18.4s, #20
+ ushr v25.4s, v17.4s, #12
+ shl v17.4s, v17.4s, #20
+ ushr v28.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ orr v13.16b, v18.16b, v23.16b
+ orr v25.16b, v17.16b, v25.16b
+ orr v2.16b, v19.16b, v28.16b
+ add v28.4s, v0.4s, v12.4s
+ add v0.4s, v3.4s, v26.4s
+ add v18.4s, v1.4s, v13.4s
+ add v3.4s, v5.4s, v16.4s
+ eor v1.16b, v28.16b, v4.16b
+ add v17.4s, v0.4s, v25.4s
+ eor v0.16b, v18.16b, v6.16b
+ add v19.4s, v3.4s, v2.4s
+ ushr v16.4s, v1.4s, #8
+ shl v3.4s, v1.4s, #24
+ eor v4.16b, v17.16b, v7.16b
+ ushr v6.4s, v0.4s, #8
+ shl v1.4s, v0.4s, #24
+ eor v5.16b, v19.16b, v31.16b
+ ushr v23.4s, v4.4s, #8
+ shl v4.4s, v4.4s, #24
+ orr v7.16b, v3.16b, v16.16b
+ orr v6.16b, v1.16b, v6.16b
+ ushr v31.4s, v5.4s, #8
+ shl v0.4s, v5.4s, #24
+ orr v5.16b, v4.16b, v23.16b
+ add v4.4s, v7.4s, v21.4s
+ ldr q21, [sp, #192]
+ add v3.4s, v6.4s, v29.4s
+ orr v31.16b, v0.16b, v31.16b
+ add v23.4s, v5.4s, v30.4s
+ eor v0.16b, v4.16b, v12.16b
+ eor v1.16b, v3.16b, v13.16b
+ add v16.4s, v31.4s, v24.4s
+ eor v20.16b, v23.16b, v25.16b
+ ushr v24.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ ushr v29.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ ushr v30.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ orr v25.16b, v0.16b, v24.16b
+ orr v0.16b, v1.16b, v29.16b
+ mov v29.16b, v10.16b
+ orr v1.16b, v20.16b, v30.16b
+ mov v20.16b, v10.16b
+ mov v24.16b, v21.16b
+ ldr q20, [sp, #224]
+ mov v29.d[1], v8.d[0]
+ mov v13.16b, v9.16b
+ zip2 v30.2d, v10.2d, v8.2d
+ zip2 v8.2d, v21.2d, v20.2d
+ mov v26.16b, v11.16b
+ mov v24.d[1], v20.d[0]
+ add v20.4s, v28.4s, v29.4s
+ mov v13.d[1], v14.d[0]
+ str q8, [sp, #128]
+ eor v2.16b, v16.16b, v2.16b
+ mov v26.d[1], v15.d[0]
+ str q24, [sp, #192]
+ add v20.4s, v20.4s, v0.4s
+ add v19.4s, v19.4s, v13.4s
+ ushr v12.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ zip2 v10.2d, v9.2d, v14.2d
+ add v18.4s, v18.4s, v24.4s
add v17.4s, v17.4s, v26.4s
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- eor v10.16b, v3.16b, v10.16b
- add v17.4s, v17.4s, v8.4s
- orr v16.16b, v16.16b, v15.16b
- ushr v15.4s, v10.4s, #8
- shl v10.4s, v10.4s, #24
- eor v11.16b, v17.16b, v11.16b
- add v4.4s, v0.4s, v4.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v11.4s, #8
- shl v11.4s, v11.4s, #24
- eor v9.16b, v4.16b, v9.16b
- add v12.4s, v16.4s, v12.4s
- orr v11.16b, v11.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v31.16b, v12.16b, v31.16b
- add v1.4s, v10.4s, v1.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #7
- shl v31.4s, v31.4s, #25
- eor v13.16b, v1.16b, v13.16b
- add v5.4s, v11.4s, v5.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v13.4s, #7
- shl v13.4s, v13.4s, #25
- eor v8.16b, v5.16b, v8.16b
- orr v13.16b, v13.16b, v15.16b
- ushr v15.4s, v8.4s, #7
- shl v8.4s, v8.4s, #25
- orr v8.16b, v8.16b, v15.16b
- add v14.4s, v14.4s, v18.4s
- add v14.4s, v14.4s, v8.4s
- add v2.4s, v2.4s, v27.4s
- eor v16.16b, v14.16b, v16.16b
- add v2.4s, v2.4s, v9.4s
- add v3.4s, v3.4s, v7.4s
- rev32 v16.8h, v16.8h
- eor v10.16b, v2.16b, v10.16b
- add v3.4s, v3.4s, v31.4s
+ mov v14.16b, v26.16b
+ eor v26.16b, v20.16b, v31.16b
+ stp q10, q30, [sp, #224]
+ add v19.4s, v19.4s, v25.4s
+ orr v2.16b, v2.16b, v12.16b
+ add v18.4s, v18.4s, v1.4s
+ rev32 v26.8h, v26.8h
+ eor v5.16b, v19.16b, v5.16b
+ add v17.4s, v17.4s, v2.4s
+ eor v7.16b, v18.16b, v7.16b
+ add v23.4s, v23.4s, v26.4s
+ rev32 v5.8h, v5.8h
+ eor v6.16b, v17.16b, v6.16b
+ rev32 v7.8h, v7.8h
+ eor v0.16b, v23.16b, v0.16b
+ add v3.4s, v3.4s, v5.4s
+ rev32 v6.8h, v6.8h
+ add v16.4s, v16.4s, v7.4s
+ ushr v31.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v25.16b, v3.16b, v25.16b
+ add v4.4s, v4.4s, v6.4s
+ eor v1.16b, v16.16b, v1.16b
+ orr v0.16b, v0.16b, v31.16b
+ ushr v31.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ add v20.4s, v20.4s, v30.4s
+ zip2 v21.2d, v11.2d, v15.2d
+ ushr v11.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ eor v2.16b, v4.16b, v2.16b
+ orr v25.16b, v25.16b, v31.16b
+ add v19.4s, v19.4s, v10.4s
+ add v20.4s, v20.4s, v0.4s
+ orr v1.16b, v1.16b, v11.16b
+ ushr v11.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ add v18.4s, v18.4s, v8.4s
+ add v19.4s, v19.4s, v25.4s
+ eor v26.16b, v20.16b, v26.16b
+ orr v2.16b, v2.16b, v11.16b
add v17.4s, v17.4s, v21.4s
- add v1.4s, v1.4s, v16.4s
- rev32 v10.8h, v10.8h
- eor v11.16b, v3.16b, v11.16b
- add v17.4s, v17.4s, v13.4s
- eor v8.16b, v1.16b, v8.16b
- add v5.4s, v5.4s, v10.4s
- rev32 v11.8h, v11.8h
- eor v0.16b, v17.16b, v0.16b
- add v14.4s, v14.4s, v6.4s
- ldur q6, [x29, #-96]
- ushr v15.4s, v8.4s, #12
- shl v8.4s, v8.4s, #20
- eor v9.16b, v5.16b, v9.16b
- add v4.4s, v4.4s, v11.4s
- rev32 v0.8h, v0.8h
- stur q20, [x29, #-160]
- mov v20.16b, v29.16b
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v31.16b, v4.16b, v31.16b
- add v12.4s, v12.4s, v0.4s
- mov v19.16b, v29.16b
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #12
- shl v31.4s, v31.4s, #20
- eor v13.16b, v12.16b, v13.16b
- add v14.4s, v14.4s, v8.4s
- add v2.4s, v2.4s, v20.4s
- mov v19.16b, v28.16b
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v13.4s, #12
- shl v13.4s, v13.4s, #20
- eor v16.16b, v14.16b, v16.16b
- add v2.4s, v2.4s, v9.4s
+ add v18.4s, v18.4s, v1.4s
+ eor v5.16b, v19.16b, v5.16b
+ ushr v31.4s, v26.4s, #8
+ shl v26.4s, v26.4s, #24
+ add v17.4s, v17.4s, v2.4s
+ ushr v11.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ eor v7.16b, v18.16b, v7.16b
+ orr v26.16b, v26.16b, v31.16b
+ eor v6.16b, v17.16b, v6.16b
+ orr v5.16b, v5.16b, v11.16b
+ ushr v31.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ add v23.4s, v26.4s, v23.4s
+ ushr v11.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ orr v7.16b, v7.16b, v31.16b
+ add v3.4s, v5.4s, v3.4s
+ eor v0.16b, v23.16b, v0.16b
+ ldp q28, q12, [x29, #-256]
+ orr v6.16b, v6.16b, v11.16b
+ add v16.4s, v7.4s, v16.4s
+ eor v25.16b, v3.16b, v25.16b
+ ushr v31.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ add v4.4s, v6.4s, v4.4s
+ ushr v11.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ eor v1.16b, v16.16b, v1.16b
+ orr v0.16b, v0.16b, v31.16b
+ add v18.4s, v18.4s, v12.4s
+ mov v15.16b, v29.16b
+ ldur q29, [x29, #-208]
+ eor v2.16b, v4.16b, v2.16b
+ orr v25.16b, v25.16b, v11.16b
+ ushr v31.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ str q15, [sp, #160]
+ add v20.4s, v20.4s, v29.4s
+ add v18.4s, v18.4s, v0.4s
+ ushr v11.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v1.16b, v1.16b, v31.16b
+ add v20.4s, v20.4s, v25.4s
+ add v17.4s, v17.4s, v27.4s
+ eor v6.16b, v6.16b, v18.16b
+ orr v2.16b, v2.16b, v11.16b
+ add v19.4s, v19.4s, v28.4s
+ eor v7.16b, v7.16b, v20.16b
+ add v17.4s, v17.4s, v1.4s
+ rev32 v6.8h, v6.8h
+ add v19.4s, v19.4s, v2.4s
+ rev32 v7.8h, v7.8h
+ eor v5.16b, v17.16b, v5.16b
add v3.4s, v3.4s, v6.4s
- orr v13.16b, v13.16b, v15.16b
- ushr v15.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- eor v10.16b, v2.16b, v10.16b
- add v3.4s, v3.4s, v31.4s
- add v17.4s, v17.4s, v19.4s
- orr v16.16b, v16.16b, v15.16b
- ushr v15.4s, v10.4s, #8
- shl v10.4s, v10.4s, #24
- eor v11.16b, v3.16b, v11.16b
+ eor v26.16b, v19.16b, v26.16b
+ add v4.4s, v4.4s, v7.4s
+ rev32 v5.8h, v5.8h
+ eor v0.16b, v3.16b, v0.16b
+ rev32 v26.8h, v26.8h
+ eor v25.16b, v4.16b, v25.16b
+ add v23.4s, v23.4s, v5.4s
+ ushr v11.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ add v16.4s, v16.4s, v26.4s
+ ushr v31.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ eor v1.16b, v23.16b, v1.16b
+ orr v0.16b, v0.16b, v11.16b
+ add v18.4s, v18.4s, v24.4s
+ orr v25.16b, v25.16b, v31.16b
+ eor v2.16b, v16.16b, v2.16b
+ ushr v31.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v20.4s, v20.4s, v22.4s
+ add v18.4s, v18.4s, v0.4s
+ mov v9.16b, v30.16b
+ mov v30.16b, v21.16b
+ ldur q21, [x29, #-224]
+ ushr v11.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ orr v1.16b, v1.16b, v31.16b
+ add v20.4s, v20.4s, v25.4s
+ str q30, [sp, #144]
+ add v17.4s, v17.4s, v21.4s
+ ldur q21, [x29, #-192]
+ eor v6.16b, v18.16b, v6.16b
+ orr v2.16b, v2.16b, v11.16b
+ add v19.4s, v19.4s, v30.4s
+ eor v7.16b, v20.16b, v7.16b
+ add v17.4s, v17.4s, v1.4s
+ ushr v11.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ add v19.4s, v19.4s, v2.4s
+ ushr v31.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ eor v5.16b, v17.16b, v5.16b
+ orr v6.16b, v6.16b, v11.16b
+ eor v26.16b, v19.16b, v26.16b
+ orr v7.16b, v7.16b, v31.16b
+ ushr v31.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ add v3.4s, v6.4s, v3.4s
+ ushr v11.4s, v26.4s, #8
+ shl v26.4s, v26.4s, #24
+ add v4.4s, v7.4s, v4.4s
+ orr v5.16b, v5.16b, v31.16b
+ eor v0.16b, v3.16b, v0.16b
+ orr v26.16b, v26.16b, v11.16b
+ eor v25.16b, v4.16b, v25.16b
+ add v23.4s, v5.4s, v23.4s
+ ushr v11.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ add v16.4s, v26.4s, v16.4s
+ ushr v31.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ eor v1.16b, v23.16b, v1.16b
+ orr v0.16b, v0.16b, v11.16b
+ add v20.4s, v20.4s, v21.4s
+ orr v25.16b, v25.16b, v31.16b
+ eor v2.16b, v16.16b, v2.16b
+ ushr v31.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v20.4s, v20.4s, v0.4s
+ add v19.4s, v19.4s, v10.4s
+ ushr v11.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v1.16b, v1.16b, v31.16b
+ add v18.4s, v18.4s, v14.4s
+ eor v26.16b, v20.16b, v26.16b
+ add v19.4s, v19.4s, v25.4s
+ orr v2.16b, v2.16b, v11.16b
+ add v17.4s, v17.4s, v9.4s
+ ldr q9, [sp, #208]
+ add v18.4s, v18.4s, v1.4s
+ rev32 v26.8h, v26.8h
+ eor v5.16b, v19.16b, v5.16b
+ add v17.4s, v17.4s, v2.4s
+ eor v7.16b, v18.16b, v7.16b
+ add v23.4s, v23.4s, v26.4s
+ rev32 v5.8h, v5.8h
+ eor v6.16b, v17.16b, v6.16b
+ rev32 v7.8h, v7.8h
+ eor v0.16b, v23.16b, v0.16b
+ add v3.4s, v3.4s, v5.4s
+ rev32 v6.8h, v6.8h
+ add v16.4s, v16.4s, v7.4s
+ ushr v31.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v25.16b, v3.16b, v25.16b
+ add v4.4s, v4.4s, v6.4s
+ eor v1.16b, v16.16b, v1.16b
+ orr v0.16b, v0.16b, v31.16b
+ ushr v31.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ add v20.4s, v20.4s, v8.4s
+ ushr v11.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ eor v2.16b, v4.16b, v2.16b
+ orr v25.16b, v25.16b, v31.16b
+ add v19.4s, v19.4s, v15.4s
+ add v20.4s, v20.4s, v0.4s
+ orr v1.16b, v1.16b, v11.16b
+ ushr v11.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ add v18.4s, v18.4s, v9.4s
+ add v19.4s, v19.4s, v25.4s
+ eor v26.16b, v20.16b, v26.16b
+ orr v2.16b, v2.16b, v11.16b
add v17.4s, v17.4s, v13.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v11.4s, #8
- shl v11.4s, v11.4s, #24
- eor v0.16b, v17.16b, v0.16b
- add v1.4s, v16.4s, v1.4s
- orr v11.16b, v11.16b, v15.16b
- ushr v15.4s, v0.4s, #8
- shl v0.4s, v0.4s, #24
- eor v8.16b, v1.16b, v8.16b
- add v5.4s, v10.4s, v5.4s
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v8.4s, #7
- shl v8.4s, v8.4s, #25
- eor v9.16b, v5.16b, v9.16b
- add v4.4s, v11.4s, v4.4s
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v31.16b, v4.16b, v31.16b
- add v12.4s, v0.4s, v12.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #7
- shl v31.4s, v31.4s, #25
- eor v13.16b, v12.16b, v13.16b
- add v14.4s, v14.4s, v25.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v13.4s, #7
- shl v13.4s, v13.4s, #25
- add v14.4s, v14.4s, v9.4s
- add v2.4s, v2.4s, v30.4s
- orr v13.16b, v13.16b, v15.16b
- eor v0.16b, v14.16b, v0.16b
- add v2.4s, v2.4s, v31.4s
- add v3.4s, v3.4s, v24.4s
- rev32 v0.8h, v0.8h
- eor v16.16b, v2.16b, v16.16b
- add v3.4s, v3.4s, v13.4s
- add v17.4s, v17.4s, v26.4s
- mov v29.16b, v27.16b
- add v4.4s, v4.4s, v0.4s
- rev32 v16.8h, v16.8h
- eor v10.16b, v3.16b, v10.16b
- add v17.4s, v17.4s, v8.4s
- ldur q27, [x29, #-160]
- eor v9.16b, v4.16b, v9.16b
- add v12.4s, v12.4s, v16.4s
- rev32 v10.8h, v10.8h
- eor v11.16b, v17.16b, v11.16b
- ldur q6, [x29, #-80]
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v31.16b, v12.16b, v31.16b
- add v1.4s, v1.4s, v10.4s
- rev32 v11.8h, v11.8h
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #12
- shl v31.4s, v31.4s, #20
- eor v13.16b, v1.16b, v13.16b
- add v5.4s, v5.4s, v11.4s
- add v14.4s, v14.4s, v22.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v13.4s, #12
- shl v13.4s, v13.4s, #20
- eor v8.16b, v5.16b, v8.16b
- add v14.4s, v14.4s, v9.4s
- add v2.4s, v2.4s, v27.4s
- orr v13.16b, v13.16b, v15.16b
- ushr v15.4s, v8.4s, #12
- shl v8.4s, v8.4s, #20
- eor v0.16b, v14.16b, v0.16b
- add v2.4s, v2.4s, v31.4s
+ add v18.4s, v18.4s, v1.4s
+ eor v5.16b, v19.16b, v5.16b
+ ushr v31.4s, v26.4s, #8
+ shl v26.4s, v26.4s, #24
+ add v17.4s, v17.4s, v2.4s
+ ushr v11.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ eor v7.16b, v18.16b, v7.16b
+ orr v26.16b, v26.16b, v31.16b
+ eor v6.16b, v17.16b, v6.16b
+ orr v5.16b, v5.16b, v11.16b
+ ushr v31.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ add v23.4s, v26.4s, v23.4s
+ ushr v11.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ orr v7.16b, v7.16b, v31.16b
+ add v3.4s, v5.4s, v3.4s
+ eor v0.16b, v23.16b, v0.16b
+ orr v6.16b, v6.16b, v11.16b
+ add v16.4s, v7.4s, v16.4s
+ eor v25.16b, v3.16b, v25.16b
+ ushr v31.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ add v4.4s, v6.4s, v4.4s
+ ushr v11.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ eor v1.16b, v16.16b, v1.16b
+ orr v0.16b, v0.16b, v31.16b
+ add v18.4s, v18.4s, v24.4s
+ eor v2.16b, v4.16b, v2.16b
+ orr v25.16b, v25.16b, v11.16b
+ ushr v31.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v20.4s, v20.4s, v12.4s
+ add v18.4s, v18.4s, v0.4s
+ ushr v11.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v1.16b, v1.16b, v31.16b
+ add v20.4s, v20.4s, v25.4s
+ add v17.4s, v17.4s, v30.4s
+ eor v6.16b, v6.16b, v18.16b
+ orr v2.16b, v2.16b, v11.16b
+ add v19.4s, v19.4s, v27.4s
+ eor v7.16b, v7.16b, v20.16b
+ add v17.4s, v17.4s, v1.4s
+ rev32 v6.8h, v6.8h
+ add v19.4s, v19.4s, v2.4s
+ rev32 v7.8h, v7.8h
+ eor v5.16b, v17.16b, v5.16b
add v3.4s, v3.4s, v6.4s
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v0.4s, #8
- shl v0.4s, v0.4s, #24
- eor v16.16b, v2.16b, v16.16b
- add v3.4s, v3.4s, v13.4s
- add v17.4s, v17.4s, v23.4s
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- eor v10.16b, v3.16b, v10.16b
- add v17.4s, v17.4s, v8.4s
- orr v16.16b, v16.16b, v15.16b
- ushr v15.4s, v10.4s, #8
- shl v10.4s, v10.4s, #24
- eor v11.16b, v17.16b, v11.16b
- add v4.4s, v0.4s, v4.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v11.4s, #8
- shl v11.4s, v11.4s, #24
- eor v9.16b, v4.16b, v9.16b
- add v12.4s, v16.4s, v12.4s
- orr v11.16b, v11.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v31.16b, v12.16b, v31.16b
- add v1.4s, v10.4s, v1.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #7
- shl v31.4s, v31.4s, #25
- eor v13.16b, v1.16b, v13.16b
- add v5.4s, v11.4s, v5.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v13.4s, #7
- shl v13.4s, v13.4s, #25
- eor v8.16b, v5.16b, v8.16b
- orr v13.16b, v13.16b, v15.16b
- ushr v15.4s, v8.4s, #7
- shl v8.4s, v8.4s, #25
- orr v8.16b, v8.16b, v15.16b
- add v14.4s, v14.4s, v29.4s
- add v14.4s, v14.4s, v8.4s
- add v2.4s, v2.4s, v20.4s
- mov v28.16b, v7.16b
- eor v16.16b, v14.16b, v16.16b
- add v2.4s, v2.4s, v9.4s
- add v3.4s, v3.4s, v19.4s
- rev32 v16.8h, v16.8h
- eor v10.16b, v2.16b, v10.16b
- add v3.4s, v3.4s, v31.4s
- add v17.4s, v17.4s, v28.4s
- add v1.4s, v1.4s, v16.4s
- rev32 v10.8h, v10.8h
- eor v11.16b, v3.16b, v11.16b
+ eor v26.16b, v19.16b, v26.16b
+ add v4.4s, v4.4s, v7.4s
+ rev32 v5.8h, v5.8h
+ eor v0.16b, v3.16b, v0.16b
+ rev32 v26.8h, v26.8h
+ eor v25.16b, v4.16b, v25.16b
+ add v23.4s, v23.4s, v5.4s
+ ushr v11.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ add v16.4s, v16.4s, v26.4s
+ ushr v31.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ eor v1.16b, v23.16b, v1.16b
+ orr v0.16b, v0.16b, v11.16b
+ add v18.4s, v18.4s, v14.4s
+ orr v25.16b, v25.16b, v31.16b
+ eor v2.16b, v16.16b, v2.16b
+ ushr v31.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v20.4s, v20.4s, v28.4s
+ add v18.4s, v18.4s, v0.4s
+ mov v10.16b, v13.16b
+ ushr v11.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ orr v1.16b, v1.16b, v31.16b
+ add v20.4s, v20.4s, v25.4s
+ add v17.4s, v17.4s, v29.4s
+ eor v6.16b, v18.16b, v6.16b
+ orr v2.16b, v2.16b, v11.16b
+ add v19.4s, v19.4s, v10.4s
+ eor v7.16b, v20.16b, v7.16b
+ add v17.4s, v17.4s, v1.4s
+ ushr v11.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ add v19.4s, v19.4s, v2.4s
+ ushr v31.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ eor v5.16b, v17.16b, v5.16b
+ orr v6.16b, v6.16b, v11.16b
+ eor v26.16b, v19.16b, v26.16b
+ orr v7.16b, v7.16b, v31.16b
+ ushr v31.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ add v3.4s, v6.4s, v3.4s
+ ushr v11.4s, v26.4s, #8
+ shl v26.4s, v26.4s, #24
+ add v4.4s, v7.4s, v4.4s
+ orr v5.16b, v5.16b, v31.16b
+ eor v0.16b, v3.16b, v0.16b
+ mov v22.16b, v8.16b
+ ldp q8, q28, [sp, #240]
+ orr v26.16b, v26.16b, v11.16b
+ eor v25.16b, v4.16b, v25.16b
+ add v23.4s, v5.4s, v23.4s
+ ushr v11.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ add v16.4s, v26.4s, v16.4s
+ ushr v31.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ eor v1.16b, v23.16b, v1.16b
+ orr v0.16b, v0.16b, v11.16b
+ add v20.4s, v20.4s, v28.4s
+ orr v25.16b, v25.16b, v31.16b
+ eor v2.16b, v16.16b, v2.16b
+ ushr v31.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v20.4s, v20.4s, v0.4s
+ add v19.4s, v19.4s, v15.4s
+ ushr v11.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v1.16b, v1.16b, v31.16b
+ add v18.4s, v18.4s, v8.4s
+ eor v26.16b, v20.16b, v26.16b
+ add v19.4s, v19.4s, v25.4s
+ orr v2.16b, v2.16b, v11.16b
+ add v17.4s, v17.4s, v22.4s
+ ldur q22, [x29, #-256]
+ add v18.4s, v18.4s, v1.4s
+ rev32 v26.8h, v26.8h
+ eor v5.16b, v19.16b, v5.16b
+ add v17.4s, v17.4s, v2.4s
+ eor v7.16b, v18.16b, v7.16b
+ add v23.4s, v23.4s, v26.4s
+ rev32 v5.8h, v5.8h
+ eor v6.16b, v17.16b, v6.16b
+ rev32 v7.8h, v7.8h
+ eor v0.16b, v23.16b, v0.16b
+ add v3.4s, v3.4s, v5.4s
+ rev32 v6.8h, v6.8h
+ add v16.4s, v16.4s, v7.4s
+ ushr v31.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v25.16b, v3.16b, v25.16b
+ add v4.4s, v4.4s, v6.4s
+ eor v1.16b, v16.16b, v1.16b
+ orr v0.16b, v0.16b, v31.16b
+ ushr v31.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ add v20.4s, v20.4s, v9.4s
+ mov v13.16b, v12.16b
+ mov v12.16b, v27.16b
+ mov v27.16b, v9.16b
+ ldur q9, [x29, #-192]
+ mov v21.16b, v15.16b
+ ldr q15, [sp, #224]
+ ushr v11.4s, v1.4s, #12
+ ldur q21, [x29, #-224]
+ shl v1.4s, v1.4s, #20
+ eor v2.16b, v4.16b, v2.16b
+ orr v25.16b, v25.16b, v31.16b
+ add v19.4s, v19.4s, v9.4s
+ add v20.4s, v20.4s, v0.4s
+ orr v1.16b, v1.16b, v11.16b
+ ushr v11.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ add v18.4s, v18.4s, v21.4s
+ add v19.4s, v19.4s, v25.4s
+ eor v26.16b, v20.16b, v26.16b
+ orr v2.16b, v2.16b, v11.16b
+ add v17.4s, v17.4s, v15.4s
+ add v18.4s, v18.4s, v1.4s
+ eor v5.16b, v19.16b, v5.16b
+ ushr v31.4s, v26.4s, #8
+ shl v26.4s, v26.4s, #24
+ add v17.4s, v17.4s, v2.4s
+ ushr v11.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ eor v7.16b, v18.16b, v7.16b
+ orr v26.16b, v26.16b, v31.16b
+ eor v6.16b, v17.16b, v6.16b
+ orr v5.16b, v5.16b, v11.16b
+ ushr v31.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ add v23.4s, v26.4s, v23.4s
+ ushr v11.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ orr v7.16b, v7.16b, v31.16b
+ add v3.4s, v5.4s, v3.4s
+ eor v0.16b, v23.16b, v0.16b
+ orr v6.16b, v6.16b, v11.16b
+ add v16.4s, v7.4s, v16.4s
+ eor v25.16b, v3.16b, v25.16b
+ ushr v31.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ add v4.4s, v6.4s, v4.4s
+ ushr v11.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ eor v1.16b, v16.16b, v1.16b
+ orr v0.16b, v0.16b, v31.16b
+ add v18.4s, v18.4s, v14.4s
+ eor v2.16b, v4.16b, v2.16b
+ orr v25.16b, v25.16b, v11.16b
+ ushr v31.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v20.4s, v20.4s, v24.4s
+ add v18.4s, v18.4s, v0.4s
+ ushr v11.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v1.16b, v1.16b, v31.16b
+ add v20.4s, v20.4s, v25.4s
+ add v17.4s, v17.4s, v10.4s
+ eor v6.16b, v6.16b, v18.16b
+ orr v2.16b, v2.16b, v11.16b
+ add v19.4s, v19.4s, v30.4s
+ eor v7.16b, v7.16b, v20.16b
+ add v17.4s, v17.4s, v1.4s
+ rev32 v6.8h, v6.8h
+ add v19.4s, v19.4s, v2.4s
+ rev32 v7.8h, v7.8h
+ eor v5.16b, v17.16b, v5.16b
+ add v3.4s, v3.4s, v6.4s
+ eor v26.16b, v19.16b, v26.16b
+ add v4.4s, v4.4s, v7.4s
+ rev32 v5.8h, v5.8h
+ eor v0.16b, v3.16b, v0.16b
+ rev32 v26.8h, v26.8h
+ eor v25.16b, v4.16b, v25.16b
+ add v23.4s, v23.4s, v5.4s
+ ushr v11.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ add v16.4s, v16.4s, v26.4s
+ ushr v31.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ eor v1.16b, v23.16b, v1.16b
+ orr v0.16b, v0.16b, v11.16b
+ add v18.4s, v18.4s, v8.4s
+ orr v25.16b, v25.16b, v31.16b
+ eor v2.16b, v16.16b, v2.16b
+ ushr v31.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v20.4s, v20.4s, v12.4s
+ add v18.4s, v18.4s, v0.4s
+ ushr v11.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ orr v1.16b, v1.16b, v31.16b
+ add v20.4s, v20.4s, v25.4s
add v17.4s, v17.4s, v13.4s
- eor v8.16b, v1.16b, v8.16b
- add v5.4s, v5.4s, v10.4s
- rev32 v11.8h, v11.8h
- eor v0.16b, v17.16b, v0.16b
- ushr v15.4s, v8.4s, #12
- shl v8.4s, v8.4s, #20
- eor v9.16b, v5.16b, v9.16b
- add v4.4s, v4.4s, v11.4s
- rev32 v0.8h, v0.8h
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v31.16b, v4.16b, v31.16b
- add v12.4s, v12.4s, v0.4s
- add v14.4s, v14.4s, v21.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #12
- shl v31.4s, v31.4s, #20
- eor v13.16b, v12.16b, v13.16b
- add v14.4s, v14.4s, v8.4s
- add v2.4s, v2.4s, v30.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v13.4s, #12
- shl v13.4s, v13.4s, #20
- eor v16.16b, v14.16b, v16.16b
- add v2.4s, v2.4s, v9.4s
- orr v13.16b, v13.16b, v15.16b
- ushr v15.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- eor v10.16b, v2.16b, v10.16b
- orr v16.16b, v16.16b, v15.16b
- ushr v15.4s, v10.4s, #8
- shl v10.4s, v10.4s, #24
- add v3.4s, v3.4s, v18.4s
- orr v10.16b, v10.16b, v15.16b
- add v15.4s, v3.4s, v31.4s
- eor v3.16b, v15.16b, v11.16b
- ushr v11.4s, v3.4s, #8
- shl v3.4s, v3.4s, #24
- orr v11.16b, v3.16b, v11.16b
- add v3.4s, v17.4s, v6.4s
- add v17.4s, v3.4s, v13.4s
- eor v0.16b, v17.16b, v0.16b
- ushr v3.4s, v0.4s, #8
- shl v0.4s, v0.4s, #24
- add v1.4s, v16.4s, v1.4s
- orr v0.16b, v0.16b, v3.16b
- eor v3.16b, v1.16b, v8.16b
- ushr v8.4s, v3.4s, #7
- shl v3.4s, v3.4s, #25
- add v5.4s, v10.4s, v5.4s
- orr v8.16b, v3.16b, v8.16b
- eor v3.16b, v5.16b, v9.16b
- add v4.4s, v11.4s, v4.4s
- ushr v9.4s, v3.4s, #7
- shl v3.4s, v3.4s, #25
- eor v31.16b, v4.16b, v31.16b
- mov v7.16b, v23.16b
- mov v23.16b, v28.16b
- mov v28.16b, v6.16b
- orr v3.16b, v3.16b, v9.16b
- ushr v9.4s, v31.4s, #7
- shl v31.4s, v31.4s, #25
- ldur q6, [x29, #-64]
- orr v31.16b, v31.16b, v9.16b
- add v9.4s, v0.4s, v12.4s
- eor v12.16b, v9.16b, v13.16b
- ushr v13.4s, v12.4s, #7
- shl v12.4s, v12.4s, #25
- orr v12.16b, v12.16b, v13.16b
- add v13.4s, v14.4s, v6.4s
- add v13.4s, v13.4s, v3.4s
- eor v0.16b, v13.16b, v0.16b
- add v2.4s, v2.4s, v24.4s
- rev32 v14.8h, v0.8h
- add v0.4s, v2.4s, v31.4s
- add v6.4s, v4.4s, v14.4s
- eor v2.16b, v0.16b, v16.16b
- eor v3.16b, v6.16b, v3.16b
- rev32 v16.8h, v2.8h
- ushr v4.4s, v3.4s, #12
- shl v3.4s, v3.4s, #20
- add v2.4s, v9.4s, v16.4s
- orr v4.16b, v3.16b, v4.16b
- eor v3.16b, v2.16b, v31.16b
- ushr v31.4s, v3.4s, #12
- shl v3.4s, v3.4s, #20
- orr v3.16b, v3.16b, v31.16b
- add v31.4s, v15.4s, v22.4s
- add v31.4s, v31.4s, v12.4s
- add v17.4s, v17.4s, v7.4s
- eor v9.16b, v31.16b, v10.16b
- add v17.4s, v17.4s, v8.4s
- rev32 v9.8h, v9.8h
- eor v11.16b, v17.16b, v11.16b
- add v1.4s, v1.4s, v9.4s
- rev32 v11.8h, v11.8h
- eor v10.16b, v1.16b, v12.16b
- add v5.4s, v5.4s, v11.4s
- ushr v12.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- eor v8.16b, v5.16b, v8.16b
- orr v10.16b, v10.16b, v12.16b
- ushr v12.4s, v8.4s, #12
- shl v8.4s, v8.4s, #20
- orr v8.16b, v8.16b, v12.16b
- add v12.4s, v13.4s, v27.4s
- add v12.4s, v12.4s, v4.4s
- eor v13.16b, v12.16b, v14.16b
- ldur q14, [x29, #-96]
- mov v25.16b, v29.16b
- add v29.4s, v12.4s, v20.4s
- add v20.4s, v31.4s, v26.4s
- add v0.4s, v0.4s, v14.4s
- add v0.4s, v0.4s, v3.4s
- eor v16.16b, v0.16b, v16.16b
- add v0.4s, v0.4s, v30.4s
- ldur q30, [x29, #-112]
+ ldr q13, [sp, #160]
+ eor v6.16b, v18.16b, v6.16b
+ orr v2.16b, v2.16b, v11.16b
+ add v19.4s, v19.4s, v15.4s
+ eor v7.16b, v20.16b, v7.16b
+ add v17.4s, v17.4s, v1.4s
+ ushr v11.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ add v19.4s, v19.4s, v2.4s
+ ushr v31.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ eor v5.16b, v17.16b, v5.16b
+ orr v6.16b, v6.16b, v11.16b
+ eor v26.16b, v19.16b, v26.16b
+ orr v7.16b, v7.16b, v31.16b
+ ushr v31.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ add v3.4s, v6.4s, v3.4s
+ ushr v11.4s, v26.4s, #8
+ shl v26.4s, v26.4s, #24
+ add v4.4s, v7.4s, v4.4s
+ orr v5.16b, v5.16b, v31.16b
+ eor v0.16b, v3.16b, v0.16b
+ orr v26.16b, v26.16b, v11.16b
+ eor v25.16b, v4.16b, v25.16b
+ add v23.4s, v5.4s, v23.4s
+ ushr v11.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ add v16.4s, v26.4s, v16.4s
+ ushr v31.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ eor v1.16b, v23.16b, v1.16b
+ orr v0.16b, v0.16b, v11.16b
+ add v20.4s, v20.4s, v22.4s
+ orr v25.16b, v25.16b, v31.16b
+ eor v2.16b, v16.16b, v2.16b
+ ushr v31.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v20.4s, v20.4s, v0.4s
+ add v19.4s, v19.4s, v9.4s
+ mov v29.16b, v14.16b
+ ldr q14, [sp, #128]
+ ushr v11.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v1.16b, v1.16b, v31.16b
+ add v18.4s, v18.4s, v14.4s
+ eor v26.16b, v20.16b, v26.16b
+ add v19.4s, v19.4s, v25.4s
+ orr v2.16b, v2.16b, v11.16b
+ add v17.4s, v17.4s, v27.4s
+ add v18.4s, v18.4s, v1.4s
+ rev32 v26.8h, v26.8h
+ eor v5.16b, v19.16b, v5.16b
+ add v17.4s, v17.4s, v2.4s
+ eor v7.16b, v18.16b, v7.16b
+ add v23.4s, v23.4s, v26.4s
+ rev32 v5.8h, v5.8h
+ eor v6.16b, v17.16b, v6.16b
+ rev32 v7.8h, v7.8h
+ eor v0.16b, v23.16b, v0.16b
+ add v3.4s, v3.4s, v5.4s
+ rev32 v6.8h, v6.8h
+ add v16.4s, v16.4s, v7.4s
+ ushr v31.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v25.16b, v3.16b, v25.16b
+ add v4.4s, v4.4s, v6.4s
+ eor v1.16b, v16.16b, v1.16b
+ orr v0.16b, v0.16b, v31.16b
+ ushr v31.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ add v20.4s, v20.4s, v21.4s
+ ushr v11.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ eor v2.16b, v4.16b, v2.16b
+ orr v25.16b, v25.16b, v31.16b
+ add v19.4s, v19.4s, v28.4s
+ add v20.4s, v20.4s, v0.4s
+ mov v12.16b, v27.16b
+ ldur q27, [x29, #-208]
+ orr v1.16b, v1.16b, v11.16b
+ ushr v11.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ add v18.4s, v18.4s, v27.4s
+ add v19.4s, v19.4s, v25.4s
+ eor v26.16b, v20.16b, v26.16b
+ orr v2.16b, v2.16b, v11.16b
+ add v17.4s, v17.4s, v13.4s
+ add v18.4s, v18.4s, v1.4s
+ eor v5.16b, v19.16b, v5.16b
+ ushr v31.4s, v26.4s, #8
+ shl v26.4s, v26.4s, #24
+ add v17.4s, v17.4s, v2.4s
+ ushr v11.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ eor v7.16b, v18.16b, v7.16b
+ orr v26.16b, v26.16b, v31.16b
+ eor v6.16b, v17.16b, v6.16b
+ orr v5.16b, v5.16b, v11.16b
+ ushr v31.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ add v23.4s, v26.4s, v23.4s
+ ushr v11.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ orr v7.16b, v7.16b, v31.16b
+ add v3.4s, v5.4s, v3.4s
+ eor v0.16b, v23.16b, v0.16b
+ orr v6.16b, v6.16b, v11.16b
+ add v16.4s, v7.4s, v16.4s
+ eor v25.16b, v3.16b, v25.16b
+ ushr v31.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ add v4.4s, v6.4s, v4.4s
+ ushr v11.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ eor v1.16b, v16.16b, v1.16b
+ orr v0.16b, v0.16b, v31.16b
+ add v18.4s, v18.4s, v8.4s
+ eor v2.16b, v4.16b, v2.16b
+ orr v25.16b, v25.16b, v11.16b
+ ushr v31.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v20.4s, v20.4s, v29.4s
+ add v18.4s, v18.4s, v0.4s
+ ushr v11.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v1.16b, v1.16b, v31.16b
+ add v20.4s, v20.4s, v25.4s
+ add v17.4s, v17.4s, v15.4s
+ eor v6.16b, v6.16b, v18.16b
+ orr v2.16b, v2.16b, v11.16b
+ add v19.4s, v19.4s, v10.4s
+ eor v7.16b, v7.16b, v20.16b
+ add v17.4s, v17.4s, v1.4s
+ rev32 v6.8h, v6.8h
+ add v19.4s, v19.4s, v2.4s
+ rev32 v7.8h, v7.8h
+ eor v5.16b, v17.16b, v5.16b
+ add v3.4s, v3.4s, v6.4s
+ eor v26.16b, v19.16b, v26.16b
+ add v4.4s, v4.4s, v7.4s
+ rev32 v5.8h, v5.8h
+ eor v0.16b, v3.16b, v0.16b
+ rev32 v26.8h, v26.8h
+ eor v25.16b, v4.16b, v25.16b
+ add v23.4s, v23.4s, v5.4s
+ ushr v11.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ add v16.4s, v16.4s, v26.4s
+ ushr v31.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ eor v1.16b, v23.16b, v1.16b
+ orr v0.16b, v0.16b, v11.16b
+ add v18.4s, v18.4s, v14.4s
+ mov v30.16b, v29.16b
+ mov v29.16b, v15.16b
+ ldr q15, [sp, #144]
+ orr v25.16b, v25.16b, v31.16b
+ eor v2.16b, v16.16b, v2.16b
+ ushr v31.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v20.4s, v20.4s, v15.4s
+ add v18.4s, v18.4s, v0.4s
+ ushr v11.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ orr v1.16b, v1.16b, v31.16b
+ add v20.4s, v20.4s, v25.4s
+ add v17.4s, v17.4s, v24.4s
+ eor v6.16b, v18.16b, v6.16b
+ orr v2.16b, v2.16b, v11.16b
+ add v19.4s, v19.4s, v13.4s
+ eor v7.16b, v20.16b, v7.16b
+ add v17.4s, v17.4s, v1.4s
+ ushr v11.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ add v19.4s, v19.4s, v2.4s
+ ushr v31.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ eor v5.16b, v17.16b, v5.16b
+ orr v6.16b, v6.16b, v11.16b
+ eor v26.16b, v19.16b, v26.16b
+ orr v7.16b, v7.16b, v31.16b
+ ushr v31.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ add v3.4s, v6.4s, v3.4s
+ ushr v11.4s, v26.4s, #8
+ shl v26.4s, v26.4s, #24
+ add v4.4s, v7.4s, v4.4s
+ orr v5.16b, v5.16b, v31.16b
+ eor v0.16b, v3.16b, v0.16b
+ orr v26.16b, v26.16b, v11.16b
+ eor v25.16b, v4.16b, v25.16b
+ add v23.4s, v5.4s, v23.4s
+ ushr v11.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ mov v9.16b, v28.16b
+ mov v28.16b, v10.16b
+ ldr q10, [sp, #176]
+ add v16.4s, v26.4s, v16.4s
+ ushr v31.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ eor v1.16b, v23.16b, v1.16b
+ orr v0.16b, v0.16b, v11.16b
add v20.4s, v20.4s, v10.4s
- eor v31.16b, v20.16b, v9.16b
+ orr v25.16b, v25.16b, v31.16b
+ eor v2.16b, v16.16b, v2.16b
+ ushr v31.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v20.4s, v20.4s, v0.4s
+ add v19.4s, v19.4s, v9.4s
+ ushr v11.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v1.16b, v1.16b, v31.16b
+ add v18.4s, v18.4s, v12.4s
+ eor v26.16b, v20.16b, v26.16b
+ add v19.4s, v19.4s, v25.4s
+ orr v2.16b, v2.16b, v11.16b
+ add v17.4s, v17.4s, v21.4s
+ add v18.4s, v18.4s, v1.4s
+ rev32 v26.8h, v26.8h
+ eor v5.16b, v19.16b, v5.16b
+ add v17.4s, v17.4s, v2.4s
+ eor v7.16b, v18.16b, v7.16b
+ add v23.4s, v23.4s, v26.4s
+ rev32 v5.8h, v5.8h
+ eor v6.16b, v17.16b, v6.16b
+ rev32 v7.8h, v7.8h
+ eor v0.16b, v23.16b, v0.16b
+ add v3.4s, v3.4s, v5.4s
+ rev32 v6.8h, v6.8h
+ add v16.4s, v16.4s, v7.4s
+ ushr v31.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v25.16b, v3.16b, v25.16b
+ add v4.4s, v4.4s, v6.4s
+ eor v1.16b, v16.16b, v1.16b
+ orr v0.16b, v0.16b, v31.16b
+ ushr v31.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ ushr v11.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ eor v2.16b, v4.16b, v2.16b
+ add v20.4s, v20.4s, v27.4s
+ orr v25.16b, v25.16b, v31.16b
+ add v19.4s, v19.4s, v22.4s
+ mov v9.16b, v22.16b
+ ldur q22, [x29, #-240]
+ orr v1.16b, v1.16b, v11.16b
+ ushr v11.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ add v20.4s, v20.4s, v0.4s
+ add v18.4s, v18.4s, v22.4s
+ add v19.4s, v19.4s, v25.4s
+ mov v24.16b, v21.16b
+ ldur q21, [x29, #-192]
+ orr v2.16b, v2.16b, v11.16b
+ eor v26.16b, v20.16b, v26.16b
+ add v17.4s, v17.4s, v21.4s
+ add v18.4s, v18.4s, v1.4s
+ eor v5.16b, v19.16b, v5.16b
+ ushr v31.4s, v26.4s, #8
+ add v17.4s, v17.4s, v2.4s
+ shl v26.4s, v26.4s, #24
+ ushr v11.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ eor v7.16b, v18.16b, v7.16b
+ orr v26.16b, v26.16b, v31.16b
+ eor v6.16b, v17.16b, v6.16b
+ orr v5.16b, v5.16b, v11.16b
+ ushr v31.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ ushr v11.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ add v23.4s, v26.4s, v23.4s
+ orr v7.16b, v7.16b, v31.16b
+ add v3.4s, v5.4s, v3.4s
+ orr v6.16b, v6.16b, v11.16b
+ eor v0.16b, v23.16b, v0.16b
+ add v16.4s, v7.4s, v16.4s
+ eor v25.16b, v3.16b, v25.16b
+ add v4.4s, v6.4s, v4.4s
+ ushr v31.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ ushr v11.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ eor v1.16b, v16.16b, v1.16b
+ orr v0.16b, v0.16b, v31.16b
+ eor v2.16b, v4.16b, v2.16b
+ orr v25.16b, v25.16b, v11.16b
+ ushr v31.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v20.4s, v20.4s, v8.4s
+ add v18.4s, v18.4s, v14.4s
+ ushr v11.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v1.16b, v1.16b, v31.16b
+ add v20.4s, v20.4s, v25.4s
+ add v17.4s, v17.4s, v13.4s
+ add v18.4s, v18.4s, v0.4s
+ orr v2.16b, v2.16b, v11.16b
+ add v19.4s, v19.4s, v29.4s
+ eor v7.16b, v7.16b, v20.16b
+ add v17.4s, v17.4s, v1.4s
+ eor v6.16b, v6.16b, v18.16b
+ add v19.4s, v19.4s, v2.4s
+ rev32 v7.8h, v7.8h
+ eor v5.16b, v17.16b, v5.16b
+ rev32 v6.8h, v6.8h
+ eor v26.16b, v19.16b, v26.16b
+ add v4.4s, v4.4s, v7.4s
+ rev32 v5.8h, v5.8h
+ add v3.4s, v3.4s, v6.4s
+ rev32 v26.8h, v26.8h
+ eor v25.16b, v4.16b, v25.16b
+ add v23.4s, v23.4s, v5.4s
+ eor v0.16b, v3.16b, v0.16b
+ add v16.4s, v16.4s, v26.4s
+ ushr v31.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ ushr v11.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v1.16b, v23.16b, v1.16b
+ orr v25.16b, v25.16b, v31.16b
+ eor v2.16b, v16.16b, v2.16b
+ orr v0.16b, v0.16b, v11.16b
+ ushr v31.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
add v20.4s, v20.4s, v28.4s
+ add v18.4s, v18.4s, v12.4s
+ ushr v11.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ orr v1.16b, v1.16b, v31.16b
+ add v20.4s, v20.4s, v25.4s
add v17.4s, v17.4s, v30.4s
- add v17.4s, v17.4s, v8.4s
- eor v9.16b, v17.16b, v11.16b
- ushr v28.4s, v13.4s, #8
- shl v11.4s, v13.4s, #24
- orr v28.16b, v11.16b, v28.16b
- ushr v11.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- orr v16.16b, v16.16b, v11.16b
- ushr v11.4s, v31.4s, #8
- shl v31.4s, v31.4s, #24
- add v6.4s, v28.4s, v6.4s
- orr v31.16b, v31.16b, v11.16b
- ushr v11.4s, v9.4s, #8
- shl v9.4s, v9.4s, #24
- add v2.4s, v16.4s, v2.4s
- eor v4.16b, v6.16b, v4.16b
- orr v9.16b, v9.16b, v11.16b
- add v1.4s, v31.4s, v1.4s
- eor v3.16b, v2.16b, v3.16b
- ushr v11.4s, v4.4s, #7
- shl v4.4s, v4.4s, #25
- add v5.4s, v9.4s, v5.4s
- eor v10.16b, v1.16b, v10.16b
- orr v4.16b, v4.16b, v11.16b
- ushr v11.4s, v3.4s, #7
- shl v3.4s, v3.4s, #25
- eor v8.16b, v5.16b, v8.16b
- orr v3.16b, v3.16b, v11.16b
- ushr v11.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- orr v10.16b, v10.16b, v11.16b
- ushr v11.4s, v8.4s, #7
- shl v8.4s, v8.4s, #25
- orr v8.16b, v8.16b, v11.16b
- add v29.4s, v29.4s, v8.4s
- eor v16.16b, v29.16b, v16.16b
- add v0.4s, v0.4s, v4.4s
- mov v12.16b, v26.16b
- add v17.4s, v17.4s, v19.4s
- add v26.4s, v29.4s, v23.4s
- eor v29.16b, v0.16b, v31.16b
- add v20.4s, v20.4s, v3.4s
- rev32 v16.8h, v16.8h
- stur q18, [x29, #-176]
- mov v18.16b, v27.16b
- add v0.4s, v0.4s, v24.4s
- eor v27.16b, v20.16b, v9.16b
- add v17.4s, v17.4s, v10.4s
- rev32 v24.8h, v29.8h
- add v1.4s, v1.4s, v16.4s
+ add v18.4s, v18.4s, v0.4s
+ orr v2.16b, v2.16b, v11.16b
+ add v19.4s, v19.4s, v21.4s
+ eor v7.16b, v20.16b, v7.16b
+ add v17.4s, v17.4s, v1.4s
+ eor v6.16b, v18.16b, v6.16b
+ add v19.4s, v19.4s, v2.4s
+ ushr v31.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ ushr v11.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ eor v5.16b, v17.16b, v5.16b
+ orr v7.16b, v7.16b, v31.16b
+ eor v26.16b, v19.16b, v26.16b
+ orr v6.16b, v6.16b, v11.16b
+ ushr v31.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ ushr v11.4s, v26.4s, #8
+ shl v26.4s, v26.4s, #24
+ add v4.4s, v7.4s, v4.4s
+ orr v5.16b, v5.16b, v31.16b
+ add v3.4s, v6.4s, v3.4s
+ orr v26.16b, v26.16b, v11.16b
+ eor v25.16b, v4.16b, v25.16b
+ add v23.4s, v5.4s, v23.4s
+ eor v0.16b, v3.16b, v0.16b
+ add v16.4s, v26.4s, v16.4s
+ ushr v31.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ ushr v11.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ eor v1.16b, v23.16b, v1.16b
+ orr v25.16b, v25.16b, v31.16b
+ eor v2.16b, v16.16b, v2.16b
+ orr v0.16b, v0.16b, v11.16b
+ ushr v31.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v20.4s, v20.4s, v15.4s
+ ushr v11.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v1.16b, v1.16b, v31.16b
+ add v18.4s, v18.4s, v24.4s
+ add v20.4s, v20.4s, v0.4s
+ add v19.4s, v19.4s, v9.4s
+ mov v8.16b, v13.16b
+ ldur q13, [x29, #-208]
+ orr v2.16b, v2.16b, v11.16b
+ add v18.4s, v18.4s, v1.4s
+ add v17.4s, v17.4s, v13.4s
+ eor v26.16b, v20.16b, v26.16b
+ add v19.4s, v19.4s, v25.4s
+ eor v7.16b, v18.16b, v7.16b
+ add v17.4s, v17.4s, v2.4s
+ rev32 v26.8h, v26.8h
+ eor v5.16b, v19.16b, v5.16b
+ rev32 v7.8h, v7.8h
+ eor v6.16b, v17.16b, v6.16b
+ add v23.4s, v23.4s, v26.4s
+ rev32 v5.8h, v5.8h
+ add v16.4s, v16.4s, v7.4s
+ rev32 v6.8h, v6.8h
+ eor v0.16b, v23.16b, v0.16b
+ add v3.4s, v3.4s, v5.4s
+ eor v1.16b, v16.16b, v1.16b
+ add v4.4s, v4.4s, v6.4s
+ ushr v31.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v25.16b, v3.16b, v25.16b
+ ushr v11.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ orr v0.16b, v0.16b, v31.16b
+ eor v2.16b, v4.16b, v2.16b
+ ushr v31.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ orr v1.16b, v1.16b, v11.16b
+ ushr v11.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ add v20.4s, v20.4s, v22.4s
+ orr v25.16b, v25.16b, v31.16b
+ add v19.4s, v19.4s, v10.4s
+ mov v27.16b, v12.16b
+ mov v12.16b, v30.16b
+ mov v29.16b, v21.16b
+ mov v21.16b, v24.16b
+ ldr q24, [sp, #192]
+ mov v30.16b, v22.16b
+ ldr q22, [sp, #256]
+ orr v2.16b, v2.16b, v11.16b
+ add v20.4s, v20.4s, v0.4s
+ add v18.4s, v18.4s, v24.4s
+ add v19.4s, v19.4s, v25.4s
+ add v17.4s, v17.4s, v22.4s
+ eor v26.16b, v20.16b, v26.16b
+ add v18.4s, v18.4s, v1.4s
+ eor v5.16b, v19.16b, v5.16b
+ add v17.4s, v17.4s, v2.4s
+ ushr v31.4s, v26.4s, #8
+ shl v26.4s, v26.4s, #24
+ ushr v11.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ eor v7.16b, v18.16b, v7.16b
+ eor v6.16b, v17.16b, v6.16b
+ orr v26.16b, v26.16b, v31.16b
+ orr v5.16b, v5.16b, v11.16b
+ ushr v31.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ ushr v11.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ add v23.4s, v26.4s, v23.4s
+ orr v7.16b, v7.16b, v31.16b
+ add v3.4s, v5.4s, v3.4s
+ orr v6.16b, v6.16b, v11.16b
+ eor v0.16b, v23.16b, v0.16b
+ add v16.4s, v7.4s, v16.4s
+ eor v25.16b, v3.16b, v25.16b
+ add v4.4s, v6.4s, v4.4s
+ ushr v31.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ ushr v11.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ eor v1.16b, v16.16b, v1.16b
+ eor v2.16b, v4.16b, v2.16b
+ orr v0.16b, v0.16b, v31.16b
+ orr v25.16b, v25.16b, v11.16b
+ ushr v31.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ ushr v11.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v20.4s, v20.4s, v14.4s
+ add v18.4s, v18.4s, v27.4s
+ ldr q27, [sp, #224]
+ orr v1.16b, v1.16b, v31.16b
+ orr v2.16b, v2.16b, v11.16b
add v20.4s, v20.4s, v25.4s
- eor v25.16b, v17.16b, v28.16b
- rev32 v27.8h, v27.8h
- add v5.4s, v5.4s, v24.4s
- eor v28.16b, v1.16b, v8.16b
- rev32 v25.8h, v25.8h
- add v6.4s, v6.4s, v27.4s
- eor v4.16b, v5.16b, v4.16b
- ushr v31.4s, v28.4s, #12
- shl v28.4s, v28.4s, #20
- add v2.4s, v2.4s, v25.4s
- eor v3.16b, v6.16b, v3.16b
- orr v28.16b, v28.16b, v31.16b
- ushr v31.4s, v4.4s, #12
- shl v4.4s, v4.4s, #20
- eor v29.16b, v2.16b, v10.16b
- orr v4.16b, v4.16b, v31.16b
- ushr v31.4s, v3.4s, #12
- shl v3.4s, v3.4s, #20
- add v26.4s, v26.4s, v28.4s
- orr v3.16b, v3.16b, v31.16b
- ushr v31.4s, v29.4s, #12
- shl v29.4s, v29.4s, #20
- eor v16.16b, v26.16b, v16.16b
- add v0.4s, v0.4s, v4.4s
- add v17.4s, v17.4s, v12.4s
- orr v29.16b, v29.16b, v31.16b
- eor v24.16b, v0.16b, v24.16b
- add v0.4s, v0.4s, v22.4s
- add v20.4s, v20.4s, v3.4s
- ushr v22.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- add v23.4s, v26.4s, v21.4s
- eor v21.16b, v20.16b, v27.16b
add v17.4s, v17.4s, v29.4s
- orr v16.16b, v16.16b, v22.16b
- ushr v22.4s, v24.4s, #8
- shl v24.4s, v24.4s, #24
- eor v25.16b, v17.16b, v25.16b
- orr v22.16b, v24.16b, v22.16b
+ add v18.4s, v18.4s, v0.4s
+ add v19.4s, v19.4s, v8.4s
+ eor v7.16b, v7.16b, v20.16b
+ add v17.4s, v17.4s, v1.4s
+ eor v6.16b, v6.16b, v18.16b
+ add v19.4s, v19.4s, v2.4s
+ rev32 v7.8h, v7.8h
+ eor v5.16b, v17.16b, v5.16b
+ rev32 v6.8h, v6.8h
+ eor v26.16b, v19.16b, v26.16b
+ add v4.4s, v4.4s, v7.4s
+ rev32 v5.8h, v5.8h
+ add v3.4s, v3.4s, v6.4s
+ rev32 v26.8h, v26.8h
+ eor v25.16b, v4.16b, v25.16b
+ add v23.4s, v23.4s, v5.4s
+ eor v0.16b, v3.16b, v0.16b
+ add v16.4s, v16.4s, v26.4s
+ ushr v29.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ ushr v31.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v1.16b, v23.16b, v1.16b
+ eor v2.16b, v16.16b, v2.16b
+ orr v25.16b, v25.16b, v29.16b
+ orr v0.16b, v0.16b, v31.16b
+ ushr v29.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ ushr v31.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ add v18.4s, v18.4s, v21.4s
+ ldr q21, [sp, #240]
+ add v20.4s, v20.4s, v27.4s
+ prfm pldl1keep, [x17, #256]
+ orr v1.16b, v1.16b, v29.16b
+ prfm pldl1keep, [x21, #256]
+ orr v2.16b, v2.16b, v31.16b
+ prfm pldl1keep, [x16, #256]
+ add v18.4s, v18.4s, v0.4s
+ prfm pldl1keep, [x6, #256]
+ add v17.4s, v17.4s, v21.4s
+ add v19.4s, v19.4s, v22.4s
+ add v20.4s, v20.4s, v25.4s
+ eor v6.16b, v18.16b, v6.16b
+ add v17.4s, v17.4s, v1.4s
+ add v19.4s, v19.4s, v2.4s
+ eor v7.16b, v20.16b, v7.16b
+ ushr v22.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ eor v5.16b, v17.16b, v5.16b
+ eor v26.16b, v19.16b, v26.16b
+ ushr v21.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ orr v6.16b, v6.16b, v22.16b
+ ushr v22.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ ushr v29.4s, v26.4s, #8
+ shl v26.4s, v26.4s, #24
+ orr v7.16b, v7.16b, v21.16b
+ orr v5.16b, v5.16b, v22.16b
+ add v3.4s, v6.4s, v3.4s
+ orr v21.16b, v26.16b, v29.16b
+ add v4.4s, v7.4s, v4.4s
+ add v22.4s, v5.4s, v23.4s
+ eor v0.16b, v3.16b, v0.16b
+ add v16.4s, v21.4s, v16.4s
+ eor v23.16b, v4.16b, v25.16b
+ eor v1.16b, v22.16b, v1.16b
+ ushr v25.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ eor v2.16b, v16.16b, v2.16b
+ ushr v26.4s, v23.4s, #7
+ shl v23.4s, v23.4s, #25
+ orr v0.16b, v0.16b, v25.16b
+ ushr v25.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ ushr v29.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v20.4s, v20.4s, v28.4s
+ orr v23.16b, v23.16b, v26.16b
+ orr v1.16b, v1.16b, v25.16b
+ orr v2.16b, v2.16b, v29.16b
+ add v20.4s, v20.4s, v0.4s
+ add v18.4s, v18.4s, v13.4s
+ add v17.4s, v17.4s, v30.4s
+ add v19.4s, v19.4s, v10.4s
+ eor v21.16b, v20.16b, v21.16b
+ add v18.4s, v18.4s, v1.4s
+ add v17.4s, v17.4s, v2.4s
+ add v19.4s, v19.4s, v23.4s
+ rev32 v21.8h, v21.8h
+ eor v7.16b, v18.16b, v7.16b
+ eor v6.16b, v17.16b, v6.16b
+ eor v5.16b, v19.16b, v5.16b
+ add v22.4s, v22.4s, v21.4s
+ rev32 v7.8h, v7.8h
+ rev32 v6.8h, v6.8h
+ rev32 v5.8h, v5.8h
+ eor v0.16b, v22.16b, v0.16b
+ add v16.4s, v16.4s, v7.4s
+ add v4.4s, v4.4s, v6.4s
+ add v3.4s, v3.4s, v5.4s
+ ushr v25.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v1.16b, v16.16b, v1.16b
+ eor v2.16b, v4.16b, v2.16b
+ eor v23.16b, v3.16b, v23.16b
+ orr v0.16b, v0.16b, v25.16b
+ ushr v25.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ ushr v26.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ ushr v27.4s, v23.4s, #12
+ shl v23.4s, v23.4s, #20
+ orr v1.16b, v1.16b, v25.16b
+ add v20.4s, v20.4s, v24.4s
+ orr v2.16b, v2.16b, v26.16b
+ orr v23.16b, v23.16b, v27.16b
+ add v18.4s, v18.4s, v12.4s
+ add v17.4s, v17.4s, v9.4s
+ add v19.4s, v19.4s, v15.4s
+ add v20.4s, v20.4s, v0.4s
+ add v18.4s, v18.4s, v1.4s
+ add v17.4s, v17.4s, v2.4s
+ add v19.4s, v19.4s, v23.4s
+ eor v21.16b, v20.16b, v21.16b
+ eor v7.16b, v18.16b, v7.16b
+ eor v6.16b, v17.16b, v6.16b
+ eor v5.16b, v19.16b, v5.16b
ushr v24.4s, v21.4s, #8
shl v21.4s, v21.4s, #24
+ ushr v25.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ ushr v26.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ ushr v27.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
orr v21.16b, v21.16b, v24.16b
- ushr v24.4s, v25.4s, #8
- shl v25.4s, v25.4s, #24
- add v1.4s, v16.4s, v1.4s
- orr v24.16b, v25.16b, v24.16b
- add v5.4s, v22.4s, v5.4s
- eor v25.16b, v1.16b, v28.16b
- add v6.4s, v21.4s, v6.4s
- eor v4.16b, v5.16b, v4.16b
- ushr v27.4s, v25.4s, #7
- shl v25.4s, v25.4s, #25
- add v2.4s, v24.4s, v2.4s
- eor v3.16b, v6.16b, v3.16b
- orr v25.16b, v25.16b, v27.16b
- ushr v27.4s, v4.4s, #7
- shl v4.4s, v4.4s, #25
- ldur q19, [x29, #-176]
- eor v26.16b, v2.16b, v29.16b
- orr v4.16b, v4.16b, v27.16b
- ushr v27.4s, v3.4s, #7
- shl v3.4s, v3.4s, #25
- orr v3.16b, v3.16b, v27.16b
- ushr v27.4s, v26.4s, #7
- shl v26.4s, v26.4s, #25
- add v20.4s, v20.4s, v18.4s
- add v17.4s, v17.4s, v30.4s
- orr v26.16b, v26.16b, v27.16b
- add v0.4s, v0.4s, v3.4s
- eor v16.16b, v0.16b, v16.16b
- add v0.4s, v0.4s, v19.4s
- add v19.4s, v20.4s, v26.4s
- add v17.4s, v17.4s, v25.4s
- eor v20.16b, v19.16b, v22.16b
- add v7.4s, v19.4s, v7.4s
- eor v19.16b, v17.16b, v21.16b
- ldur q21, [x29, #-64]
- add v23.4s, v23.4s, v4.4s
- eor v24.16b, v23.16b, v24.16b
- rev32 v16.8h, v16.8h
- add v17.4s, v17.4s, v21.4s
- rev32 v21.8h, v24.8h
- add v6.4s, v6.4s, v21.4s
- rev32 v20.8h, v20.8h
- add v2.4s, v2.4s, v16.4s
- eor v4.16b, v6.16b, v4.16b
- rev32 v19.8h, v19.8h
- add v1.4s, v1.4s, v20.4s
- eor v3.16b, v2.16b, v3.16b
- ushr v24.4s, v4.4s, #12
- shl v4.4s, v4.4s, #20
- add v5.4s, v5.4s, v19.4s
- eor v22.16b, v1.16b, v26.16b
- orr v4.16b, v4.16b, v24.16b
- ushr v24.4s, v3.4s, #12
- shl v3.4s, v3.4s, #20
- add v18.4s, v23.4s, v14.4s
- eor v23.16b, v5.16b, v25.16b
- orr v3.16b, v3.16b, v24.16b
- ushr v24.4s, v22.4s, #12
- shl v22.4s, v22.4s, #20
- orr v22.16b, v22.16b, v24.16b
- ushr v24.4s, v23.4s, #12
- shl v23.4s, v23.4s, #20
- orr v23.16b, v23.16b, v24.16b
- add v18.4s, v18.4s, v4.4s
- add v0.4s, v0.4s, v3.4s
- add v24.4s, v17.4s, v23.4s
- eor v17.16b, v18.16b, v21.16b
- add v7.4s, v7.4s, v22.4s
- eor v16.16b, v0.16b, v16.16b
- ushr v21.4s, v17.4s, #8
- shl v17.4s, v17.4s, #24
- eor v20.16b, v7.16b, v20.16b
- orr v21.16b, v17.16b, v21.16b
- ushr v17.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- eor v19.16b, v24.16b, v19.16b
- orr v16.16b, v16.16b, v17.16b
- ushr v17.4s, v20.4s, #8
- shl v20.4s, v20.4s, #24
- orr v25.16b, v20.16b, v17.16b
- ushr v17.4s, v19.4s, #8
- shl v19.4s, v19.4s, #24
- orr v19.16b, v19.16b, v17.16b
- add v1.4s, v25.4s, v1.4s
- eor v22.16b, v1.16b, v22.16b
- eor v20.16b, v1.16b, v18.16b
- add v1.4s, v19.4s, v5.4s
- eor v26.16b, v1.16b, v0.16b
- add v0.4s, v21.4s, v6.4s
- eor v5.16b, v1.16b, v23.16b
- eor v1.16b, v0.16b, v4.16b
- eor v17.16b, v0.16b, v7.16b
- add v0.4s, v16.4s, v2.4s
- eor v2.16b, v0.16b, v3.16b
- eor v6.16b, v0.16b, v24.16b
- ushr v0.4s, v1.4s, #7
+ orr v7.16b, v7.16b, v25.16b
+ orr v6.16b, v6.16b, v26.16b
+ orr v5.16b, v5.16b, v27.16b
+ add v22.4s, v21.4s, v22.4s
+ add v16.4s, v7.4s, v16.4s
+ add v4.4s, v6.4s, v4.4s
+ add v3.4s, v5.4s, v3.4s
+ eor v0.16b, v22.16b, v0.16b
+ eor v1.16b, v16.16b, v1.16b
+ eor v2.16b, v4.16b, v2.16b
+ eor v23.16b, v3.16b, v23.16b
+ ushr v24.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ ushr v25.4s, v1.4s, #7
shl v1.4s, v1.4s, #25
- orr v0.16b, v1.16b, v0.16b
- ushr v1.4s, v2.4s, #7
+ ushr v26.4s, v2.4s, #7
shl v2.4s, v2.4s, #25
- orr v1.16b, v2.16b, v1.16b
- ushr v2.4s, v22.4s, #7
- shl v3.4s, v22.4s, #25
- orr v2.16b, v3.16b, v2.16b
- ushr v3.4s, v5.4s, #7
- shl v4.4s, v5.4s, #25
- orr v3.16b, v4.16b, v3.16b
- eor v8.16b, v16.16b, v3.16b
- eor v9.16b, v25.16b, v0.16b
- eor v31.16b, v1.16b, v19.16b
- cmp x17, x22
- eor v15.16b, v2.16b, v21.16b
- mov w18, w19
- b.ne .LBB2_4
-.LBB2_7:
- zip1 v0.4s, v20.4s, v26.4s
- zip2 v1.4s, v20.4s, v26.4s
- zip1 v2.4s, v17.4s, v6.4s
- zip2 v3.4s, v17.4s, v6.4s
- zip1 v4.4s, v8.4s, v9.4s
- zip2 v5.4s, v8.4s, v9.4s
- zip1 v6.4s, v31.4s, v15.4s
- zip2 v7.4s, v31.4s, v15.4s
- add x13, x20, #4
- tst w5, #0x1
- sub x28, x28, #4
- zip1 v16.2d, v0.2d, v2.2d
- zip2 v0.2d, v0.2d, v2.2d
- zip1 v2.2d, v1.2d, v3.2d
- zip2 v1.2d, v1.2d, v3.2d
- zip1 v3.2d, v4.2d, v6.2d
- zip2 v4.2d, v4.2d, v6.2d
- zip1 v6.2d, v5.2d, v7.2d
- zip2 v5.2d, v5.2d, v7.2d
- add x24, x24, #32
- csel x20, x13, x20, ne
- cmp x28, #3
- stp q16, q3, [x26]
- stp q0, q4, [x26, #32]
- stp q2, q6, [x26, #64]
- stp q1, q5, [x26, #96]
- add x26, x26, #128
- b.hi .LBB2_2
-.LBB2_8:
- cbz x28, .LBB2_16
+ ushr v27.4s, v23.4s, #7
+ shl v23.4s, v23.4s, #25
+ orr v0.16b, v0.16b, v24.16b
+ orr v1.16b, v1.16b, v25.16b
+ orr v2.16b, v2.16b, v26.16b
+ orr v23.16b, v23.16b, v27.16b
+ movi v24.4s, #64
+ eor v12.16b, v4.16b, v20.16b
+ eor v31.16b, v18.16b, v3.16b
+ eor v29.16b, v17.16b, v22.16b
+ eor v30.16b, v16.16b, v19.16b
+ eor v28.16b, v7.16b, v23.16b
+ eor v23.16b, v6.16b, v0.16b
+ eor v13.16b, v1.16b, v5.16b
+ eor v25.16b, v2.16b, v21.16b
+ cbnz x15, .LBB3_5
+ b .LBB3_2
+.LBB3_6:
+ cbz x24, .LBB3_14
orr w8, w7, w19
- and x21, x5, #0x1
- stur w8, [x29, #-64]
-.LBB2_10:
+ and x22, x5, #0x1
+ stur w8, [x29, #-192]
+.LBB3_8:
ldr x8, [sp, #40]
- ldr x25, [x24]
- ldur w4, [x29, #-64]
- ldp q1, q0, [x8]
- mov x8, x22
- stp q1, q0, [x29, #-48]
-.LBB2_11:
- subs x23, x8, #1
- b.eq .LBB2_13
- cbnz x8, .LBB2_14
- b .LBB2_15
-.LBB2_13:
- orr w4, w4, w27
-.LBB2_14:
- sub x0, x29, #48
- mov w2, #64
- mov x1, x25
- mov x3, x20
- bl zfs_blake3_compress_in_place_sse2
+ mov x28, x0
+ ldr x25, [x0]
+ mov x23, x2
+ ldur w5, [x29, #-192]
+ ldp q0, q1, [x8]
+ mov x8, x2
+ b .LBB3_11
+.LBB3_9:
+ orr w5, w5, w27
+.LBB3_10:
+ sub x0, x29, #144
+ sub x1, x29, #176
+ mov x2, x25
+ mov w3, #64
+ mov x4, x20
+ bl compress_pre
+ ldp q0, q1, [x29, #-144]
add x25, x25, #64
- mov x8, x23
- mov w4, w19
- b .LBB2_11
-.LBB2_15:
- ldp q0, q1, [x29, #-48]
- add x20, x20, x21
- add x24, x24, #8
- subs x28, x28, #1
- stp q0, q1, [x26], #32
- b.ne .LBB2_10
-.LBB2_16:
- add sp, sp, #384
+ mov x8, x21
+ mov w5, w19
+ ldp q2, q3, [x29, #-112]
+ eor v0.16b, v2.16b, v0.16b
+ eor v1.16b, v3.16b, v1.16b
+.LBB3_11:
+ subs x21, x8, #1
+ stp q0, q1, [x29, #-176]
+ b.eq .LBB3_9
+ cbnz x8, .LBB3_10
+ ldp q1, q0, [x29, #-176]
+ mov x0, x28
+ add x20, x20, x22
+ add x0, x28, #8
+ subs x24, x24, #1
+ mov x2, x23
+ stp q1, q0, [x26], #32
+ b.ne .LBB3_8
+.LBB3_14:
+ add sp, sp, #464
ldp x20, x19, [sp, #144]
ldp x22, x21, [sp, #128]
ldp x24, x23, [sp, #112]
ldp d11, d10, [sp, #32]
ldp d13, d12, [sp, #16]
ldp d15, d14, [sp], #160
+ hint #29
ret
-.Lfunc_end2:
- .size zfs_blake3_hash_many_sse2, .Lfunc_end2-zfs_blake3_hash_many_sse2
+.Lfunc_end3:
+ .size zfs_blake3_hash_many_sse2, .Lfunc_end3-zfs_blake3_hash_many_sse2
.cfi_endproc
.section ".note.GNU-stack","",@progbits
-#endif
+#endif
\ No newline at end of file
/*
* Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
* Copyright (c) 2019-2022 Samuel Neves
- * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ * Copyright (c) 2022-2023 Tino Reichardt <milky-zfs@mcmilk.de>
*
* This is converted assembly: SSE4.1 -> ARMv8-A
* Used tools: SIMDe https://github.com/simd-everywhere/simde
+ *
+ * Should work on FreeBSD, Linux and macOS
+ * see: https://github.com/mcmilk/BLAKE3-tests/blob/master/contrib/simde.sh
*/
#if defined(__aarch64__)
.text
+ .section .note.gnu.property,"a",@note
+ .p2align 3
+ .word 4
+ .word 16
+ .word 5
+ .asciz "GNU"
+ .word 3221225472
+ .word 4
+ .word 3
+ .word 0
+.Lsec_end0:
+ .text
+ .globl zfs_blake3_compress_in_place_sse41 /* exported wrapper: calls compress_pre on a stack buffer, then folds the 64-byte result into 32 bytes written back through the x0 pointer */
+ .p2align 2
+ .type zfs_blake3_compress_in_place_sse41,@function
+zfs_blake3_compress_in_place_sse41: /* in: x0 = 32-byte in/out buffer, x1, w2, x3, w4 forwarded unchanged to compress_pre (presumably cv, block, block_len, counter, flags as in the BLAKE3 C API - confirm against the C prototype) */
+ .cfi_startproc
+ hint #25 /* HINT #25 is the encoding of PACIASP: sign the return address in x30 */
+ .cfi_negate_ra_state
+ sub sp, sp, #96 /* frame: [sp, #0..63] scratch for compress_pre, [sp, #64] x29/x30, [sp, #80] x19 */
+ stp x29, x30, [sp, #64]
+ add x29, sp, #64
+ str x19, [sp, #80] /* x19 is callee-saved and is used below to keep x0 across the call */
+ .cfi_def_cfa w29, 32
+ .cfi_offset w19, -16
+ .cfi_offset w30, -24
+ .cfi_offset w29, -32
+ mov x19, x0 /* remember the caller buffer pointer */
+ mov w5, w4 /* shift every argument up one slot, highest first so no source is overwritten before it is read */
+ mov x4, x3
+ mov w3, w2
+ mov x2, x1
+ mov x0, sp /* compress_pre arg 0: 64-byte output buffer at the bottom of the frame */
+ mov x1, x19 /* compress_pre arg 1: the original x0 pointer */
+ bl compress_pre
+ ldp q0, q1, [sp] /* bytes 0..31 of the compress_pre output */
+ ldp q2, q3, [sp, #32] /* bytes 32..63 of the compress_pre output */
+ eor v0.16b, v2.16b, v0.16b /* low half XOR high half */
+ eor v1.16b, v3.16b, v1.16b
+ ldp x29, x30, [sp, #64]
+ stp q0, q1, [x19] /* write the 32-byte result back through the original x0 pointer */
+ ldr x19, [sp, #80]
+ add sp, sp, #96
+ hint #29 /* HINT #29 is the encoding of AUTIASP: authenticate x30 before returning */
+ ret
+.Lfunc_end0:
+ .size zfs_blake3_compress_in_place_sse41, .Lfunc_end0-zfs_blake3_compress_in_place_sse41
+ .cfi_endproc
+
.section .rodata.cst16,"aM",@progbits,16
.p2align 4
-.LCPI0_0:
+.LCPI1_0: /* 16-byte constant that compress_pre loads and stores at offset 32 of its output buffer */
+ .xword -4942790177982912921 /* = 0xBB67AE856A09E667: little-endian 32-bit words 0x6A09E667, 0xBB67AE85 */
+ .xword -6534734903820487822 /* = 0xA54FF53A3C6EF372: little-endian 32-bit words 0x3C6EF372, 0xA54FF53A */
+.LCPI1_1:
.byte 2
.byte 3
.byte 0
.byte 15
.byte 12
.byte 13
-.LCPI0_1:
- .word 1779033703
- .word 3144134277
- .word 1013904242
- .word 2773480762
-.LCPI0_2:
+.LCPI1_2:
.byte 1
.byte 2
.byte 3
.byte 14
.byte 15
.byte 12
-.LCPI0_3:
- .byte 0
- .byte 1
- .byte 2
- .byte 3
- .byte 20
- .byte 21
- .byte 22
- .byte 23
- .byte 8
- .byte 9
- .byte 10
- .byte 11
- .byte 28
- .byte 29
- .byte 30
- .byte 31
-.LCPI0_4:
- .byte 0
- .byte 1
- .byte 2
- .byte 3
- .byte 4
- .byte 5
- .byte 6
- .byte 7
- .byte 8
- .byte 9
- .byte 10
- .byte 11
- .byte 28
- .byte 29
- .byte 30
- .byte 31
.text
- .globl zfs_blake3_compress_in_place_sse41
.p2align 2
- .type zfs_blake3_compress_in_place_sse41,@function
-zfs_blake3_compress_in_place_sse41:
+ .type compress_pre,@function
+compress_pre:
.cfi_startproc
- ldp q7, q6, [x0]
- ldp q17, q18, [x1]
- add x12, x1, #32
- ld2 { v4.4s, v5.4s }, [x12]
- lsr x10, x3, #32
- fmov s16, w3
- adrp x13, .LCPI0_0
- adrp x11, .LCPI0_1
- and w8, w2, #0xff
- mov v16.s[1], w10
- ldr q0, [x13, :lo12:.LCPI0_0]
- ldr q20, [x11, :lo12:.LCPI0_1]
- adrp x11, .LCPI0_4
- and w9, w4, #0xff
- ldr q2, [x11, :lo12:.LCPI0_4]
- mov v16.s[2], w8
- uzp1 v21.4s, v17.4s, v18.4s
- add v7.4s, v6.4s, v7.4s
- adrp x12, .LCPI0_3
- mov v16.s[3], w9
- uzp2 v18.4s, v17.4s, v18.4s
- add v7.4s, v7.4s, v21.4s
- ext v17.16b, v5.16b, v5.16b, #12
- ldr q3, [x12, :lo12:.LCPI0_3]
- ext v24.16b, v4.16b, v4.16b, #12
- eor v16.16b, v7.16b, v16.16b
- mov v27.16b, v17.16b
- uzp1 v19.4s, v21.4s, v21.4s
- ext v25.16b, v21.16b, v21.16b, #12
- zip2 v28.4s, v18.4s, v17.4s
- tbl v29.16b, { v16.16b }, v0.16b
- mov v27.s[1], v24.s[2]
- zip1 v23.2d, v17.2d, v18.2d
- ext v19.16b, v19.16b, v21.16b, #8
- add v22.4s, v29.4s, v20.4s
- ext v26.16b, v21.16b, v25.16b, #12
- tbl v20.16b, { v23.16b, v24.16b }, v2.16b
- zip1 v21.4s, v28.4s, v24.4s
- zip1 v23.4s, v24.4s, v28.4s
- uzp2 v19.4s, v19.4s, v18.4s
- eor v24.16b, v22.16b, v6.16b
- ext v25.16b, v20.16b, v20.16b, #12
- ext v6.16b, v23.16b, v21.16b, #8
- add v7.4s, v7.4s, v18.4s
- ext v18.16b, v19.16b, v19.16b, #4
- tbl v16.16b, { v26.16b, v27.16b }, v3.16b
- uzp1 v21.4s, v20.4s, v25.4s
- mov v26.16b, v6.16b
- ext v23.16b, v18.16b, v18.16b, #12
- mov v26.s[1], v21.s[2]
- adrp x10, .LCPI0_2
- ext v25.16b, v18.16b, v23.16b, #12
- uzp1 v23.4s, v18.4s, v18.4s
- ldr q1, [x10, :lo12:.LCPI0_2]
- ext v18.16b, v23.16b, v18.16b, #8
- ushr v23.4s, v24.4s, #12
- shl v24.4s, v24.4s, #20
- orr v23.16b, v24.16b, v23.16b
- add v7.4s, v7.4s, v23.4s
- eor v27.16b, v29.16b, v7.16b
- add v4.4s, v7.4s, v4.4s
- tbl v7.16b, { v25.16b, v26.16b }, v3.16b
- tbl v26.16b, { v27.16b }, v1.16b
- add v22.4s, v22.4s, v26.4s
- uzp2 v18.4s, v18.4s, v16.4s
- eor v23.16b, v23.16b, v22.16b
- ext v5.16b, v18.16b, v18.16b, #4
- ushr v27.4s, v23.4s, #7
- shl v23.4s, v23.4s, #25
- uzp1 v25.4s, v5.4s, v5.4s
- orr v23.16b, v23.16b, v27.16b
- ext v28.16b, v4.16b, v4.16b, #12
- ext v4.16b, v25.16b, v5.16b, #8
- ext v25.16b, v26.16b, v26.16b, #8
- add v26.4s, v28.4s, v23.4s
- eor v25.16b, v26.16b, v25.16b
- ext v22.16b, v22.16b, v22.16b, #4
- tbl v25.16b, { v25.16b }, v0.16b
- add v22.4s, v22.4s, v25.4s
- eor v23.16b, v23.16b, v22.16b
- add v17.4s, v26.4s, v17.4s
- ushr v26.4s, v23.4s, #12
- shl v23.4s, v23.4s, #20
- orr v23.16b, v23.16b, v26.16b
- add v17.4s, v17.4s, v23.4s
- eor v25.16b, v25.16b, v17.16b
- add v17.4s, v17.4s, v19.4s
- tbl v19.16b, { v25.16b }, v1.16b
- add v22.4s, v22.4s, v19.4s
- eor v23.16b, v23.16b, v22.16b
- ushr v25.4s, v23.4s, #7
- shl v23.4s, v23.4s, #25
- ext v17.16b, v17.16b, v17.16b, #4
- orr v23.16b, v23.16b, v25.16b
- ext v19.16b, v19.16b, v19.16b, #8
- add v17.4s, v17.4s, v23.4s
- eor v19.16b, v17.16b, v19.16b
- ext v22.16b, v22.16b, v22.16b, #12
- tbl v19.16b, { v19.16b }, v0.16b
- add v22.4s, v22.4s, v19.4s
- eor v23.16b, v23.16b, v22.16b
- ushr v25.4s, v23.4s, #12
- shl v23.4s, v23.4s, #20
- add v17.4s, v17.4s, v16.4s
- orr v23.16b, v23.16b, v25.16b
- add v17.4s, v17.4s, v23.4s
- ext v25.16b, v17.16b, v17.16b, #12
- eor v17.16b, v19.16b, v17.16b
- tbl v17.16b, { v17.16b }, v1.16b
- add v19.4s, v22.4s, v17.4s
- eor v22.16b, v23.16b, v19.16b
- add v25.4s, v25.4s, v21.4s
- zip1 v20.2d, v6.2d, v16.2d
- ushr v23.4s, v22.4s, #7
- shl v22.4s, v22.4s, #25
- zip2 v24.4s, v16.4s, v6.4s
- tbl v26.16b, { v20.16b, v21.16b }, v2.16b
- orr v22.16b, v22.16b, v23.16b
- zip1 v16.4s, v24.4s, v21.4s
- zip1 v20.4s, v21.4s, v24.4s
- ext v21.16b, v26.16b, v26.16b, #12
- ext v17.16b, v17.16b, v17.16b, #8
- add v25.4s, v25.4s, v22.4s
- ext v16.16b, v20.16b, v16.16b, #8
- uzp1 v21.4s, v26.4s, v21.4s
- eor v26.16b, v25.16b, v17.16b
- ext v19.16b, v19.16b, v19.16b, #4
- tbl v26.16b, { v26.16b }, v0.16b
- mov v29.16b, v16.16b
- add v19.4s, v19.4s, v26.4s
- ext v27.16b, v5.16b, v5.16b, #12
- mov v29.s[1], v21.s[2]
- eor v22.16b, v22.16b, v19.16b
- ext v28.16b, v5.16b, v27.16b, #12
- ushr v27.4s, v22.4s, #12
- shl v22.4s, v22.4s, #20
- add v6.4s, v25.4s, v6.4s
- orr v22.16b, v22.16b, v27.16b
- add v6.4s, v6.4s, v22.4s
- eor v26.16b, v26.16b, v6.16b
- add v6.4s, v6.4s, v18.4s
- tbl v18.16b, { v26.16b }, v1.16b
- add v19.4s, v19.4s, v18.4s
- eor v22.16b, v22.16b, v19.16b
- ushr v26.4s, v22.4s, #7
- shl v22.4s, v22.4s, #25
- ext v6.16b, v6.16b, v6.16b, #4
- orr v22.16b, v22.16b, v26.16b
- ext v18.16b, v18.16b, v18.16b, #8
- add v6.4s, v6.4s, v22.4s
- eor v18.16b, v6.16b, v18.16b
- ext v19.16b, v19.16b, v19.16b, #12
- tbl v18.16b, { v18.16b }, v0.16b
- add v19.4s, v19.4s, v18.4s
- eor v22.16b, v22.16b, v19.16b
- ushr v26.4s, v22.4s, #12
- shl v22.4s, v22.4s, #20
- add v6.4s, v6.4s, v7.4s
- orr v22.16b, v22.16b, v26.16b
- add v6.4s, v6.4s, v22.4s
- ext v26.16b, v6.16b, v6.16b, #12
- eor v6.16b, v18.16b, v6.16b
- uzp2 v4.4s, v4.4s, v7.4s
- zip2 v25.4s, v7.4s, v16.4s
- add v26.4s, v26.4s, v21.4s
- zip1 v20.2d, v16.2d, v7.2d
- tbl v6.16b, { v6.16b }, v1.16b
- ext v24.16b, v4.16b, v4.16b, #4
- tbl v27.16b, { v20.16b, v21.16b }, v2.16b
- zip1 v7.4s, v25.4s, v21.4s
- zip1 v20.4s, v21.4s, v25.4s
- add v18.4s, v19.4s, v6.4s
- uzp1 v5.4s, v24.4s, v24.4s
- ext v21.16b, v27.16b, v27.16b, #12
- ext v7.16b, v20.16b, v7.16b, #8
- eor v19.16b, v22.16b, v18.16b
- ext v5.16b, v5.16b, v24.16b, #8
- tbl v17.16b, { v28.16b, v29.16b }, v3.16b
- uzp1 v21.4s, v27.4s, v21.4s
- mov v28.16b, v7.16b
- ushr v22.4s, v19.4s, #7
- shl v19.4s, v19.4s, #25
- ext v23.16b, v24.16b, v24.16b, #12
- uzp2 v5.4s, v5.4s, v17.4s
- mov v28.s[1], v21.s[2]
- orr v19.16b, v19.16b, v22.16b
- ext v27.16b, v24.16b, v23.16b, #12
- ext v23.16b, v5.16b, v5.16b, #4
- ext v6.16b, v6.16b, v6.16b, #8
- ext v25.16b, v18.16b, v18.16b, #4
- add v18.4s, v26.4s, v19.4s
- uzp1 v24.4s, v23.4s, v23.4s
- eor v6.16b, v18.16b, v6.16b
- ext v24.16b, v24.16b, v23.16b, #8
- add v16.4s, v18.4s, v16.4s
- tbl v18.16b, { v27.16b, v28.16b }, v3.16b
- tbl v27.16b, { v6.16b }, v0.16b
- uzp2 v6.4s, v24.4s, v18.4s
- add v24.4s, v25.4s, v27.4s
- eor v19.16b, v19.16b, v24.16b
- ushr v25.4s, v19.4s, #12
- shl v19.4s, v19.4s, #20
- orr v19.16b, v19.16b, v25.16b
- add v16.4s, v16.4s, v19.4s
- eor v25.16b, v27.16b, v16.16b
- add v4.4s, v16.4s, v4.4s
- tbl v16.16b, { v25.16b }, v1.16b
- add v24.4s, v24.4s, v16.4s
- eor v19.16b, v19.16b, v24.16b
- ushr v25.4s, v19.4s, #7
- shl v19.4s, v19.4s, #25
- ext v4.16b, v4.16b, v4.16b, #4
- orr v19.16b, v19.16b, v25.16b
- ext v16.16b, v16.16b, v16.16b, #8
- add v4.4s, v4.4s, v19.4s
- eor v16.16b, v4.16b, v16.16b
- ext v24.16b, v24.16b, v24.16b, #12
- tbl v25.16b, { v16.16b }, v0.16b
- add v24.4s, v24.4s, v25.4s
- eor v16.16b, v19.16b, v24.16b
- ushr v19.4s, v16.4s, #12
- shl v16.4s, v16.4s, #20
- add v4.4s, v4.4s, v17.4s
- orr v19.16b, v16.16b, v19.16b
- add v27.4s, v4.4s, v19.4s
- eor v25.16b, v25.16b, v27.16b
- tbl v25.16b, { v25.16b }, v1.16b
- add v24.4s, v24.4s, v25.4s
- zip2 v26.4s, v17.4s, v7.4s
- ext v4.16b, v27.16b, v27.16b, #12
- eor v19.16b, v19.16b, v24.16b
- add v28.4s, v4.4s, v21.4s
- zip1 v20.2d, v7.2d, v17.2d
- zip1 v4.4s, v26.4s, v21.4s
- zip1 v17.4s, v21.4s, v26.4s
- ushr v26.4s, v19.4s, #7
- shl v19.4s, v19.4s, #25
- orr v19.16b, v19.16b, v26.16b
- ext v25.16b, v25.16b, v25.16b, #8
- add v27.4s, v28.4s, v19.4s
- eor v25.16b, v27.16b, v25.16b
- ext v24.16b, v24.16b, v24.16b, #4
- tbl v25.16b, { v25.16b }, v0.16b
- add v24.4s, v24.4s, v25.4s
- eor v19.16b, v19.16b, v24.16b
- add v7.4s, v27.4s, v7.4s
- ushr v27.4s, v19.4s, #12
- shl v19.4s, v19.4s, #20
- orr v19.16b, v19.16b, v27.16b
- add v7.4s, v7.4s, v19.4s
- eor v25.16b, v25.16b, v7.16b
- add v5.4s, v7.4s, v5.4s
- tbl v7.16b, { v25.16b }, v1.16b
- add v24.4s, v24.4s, v7.4s
- eor v19.16b, v19.16b, v24.16b
- ushr v25.4s, v19.4s, #7
- shl v19.4s, v19.4s, #25
- ext v5.16b, v5.16b, v5.16b, #4
- orr v19.16b, v19.16b, v25.16b
+ hint #34
+ fmov s1, w3
+ movi d0, #0x0000ff000000ff
+ ldr q2, [x1]
+ adrp x8, .LCPI1_0
+ mov v1.s[1], w5
+ str q2, [x0]
+ ldr q4, [x8, :lo12:.LCPI1_0]
+ ldr q5, [x1, #16]
+ adrp x8, .LCPI1_1
+ and v0.8b, v1.8b, v0.8b
+ fmov d1, x4
+ stp q5, q4, [x0, #16]
+ mov v1.d[1], v0.d[0]
+ str q1, [x0, #48]
+ ldp q6, q7, [x2]
+ uzp1 v3.4s, v6.4s, v7.4s
+ add v0.4s, v2.4s, v3.4s
+ uzp2 v2.4s, v6.4s, v7.4s
+ add v16.4s, v0.4s, v5.4s
+ ldr q0, [x8, :lo12:.LCPI1_1]
+ adrp x8, .LCPI1_2
+ eor v1.16b, v16.16b, v1.16b
+ add v7.4s, v16.4s, v2.4s
+ tbl v1.16b, { v1.16b }, v0.16b
+ add v4.4s, v1.4s, v4.4s
+ eor v5.16b, v4.16b, v5.16b
+ ushr v6.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v5.16b, v5.16b, v6.16b
+ add v6.4s, v7.4s, v5.4s
+ eor v7.16b, v1.16b, v6.16b
+ ldr q1, [x8, :lo12:.LCPI1_2]
+ add x8, x2, #32
+ tbl v7.16b, { v7.16b }, v1.16b
+ ld2 { v16.4s, v17.4s }, [x8]
+ add v4.4s, v4.4s, v7.4s
ext v7.16b, v7.16b, v7.16b, #8
- add v5.4s, v5.4s, v19.4s
- eor v7.16b, v5.16b, v7.16b
- ext v24.16b, v24.16b, v24.16b, #12
- tbl v7.16b, { v7.16b }, v0.16b
- add v24.4s, v24.4s, v7.4s
- eor v19.16b, v19.16b, v24.16b
- ushr v25.4s, v19.4s, #12
- shl v19.4s, v19.4s, #20
- tbl v16.16b, { v20.16b, v21.16b }, v2.16b
- add v5.4s, v5.4s, v18.4s
- orr v19.16b, v19.16b, v25.16b
- ext v20.16b, v16.16b, v16.16b, #12
- ext v4.16b, v17.16b, v4.16b, #8
- add v5.4s, v5.4s, v19.4s
- uzp1 v21.4s, v16.4s, v20.4s
- mov v17.16b, v4.16b
- ext v25.16b, v5.16b, v5.16b, #12
- mov v17.s[1], v21.s[2]
- add v25.4s, v25.4s, v21.4s
- zip1 v20.2d, v4.2d, v18.2d
- ext v22.16b, v23.16b, v23.16b, #12
- zip2 v26.4s, v18.4s, v4.4s
- tbl v18.16b, { v20.16b, v21.16b }, v2.16b
- eor v5.16b, v7.16b, v5.16b
- ext v16.16b, v23.16b, v22.16b, #12
- ext v22.16b, v6.16b, v6.16b, #4
- zip1 v27.4s, v26.4s, v21.4s
- zip1 v20.4s, v21.4s, v26.4s
- ext v21.16b, v18.16b, v18.16b, #12
- tbl v5.16b, { v5.16b }, v1.16b
- ext v20.16b, v20.16b, v27.16b, #8
- uzp1 v27.4s, v18.4s, v21.4s
- uzp1 v18.4s, v22.4s, v22.4s
- add v21.4s, v24.4s, v5.4s
- ext v18.16b, v18.16b, v22.16b, #8
- eor v19.16b, v19.16b, v21.16b
- tbl v7.16b, { v16.16b, v17.16b }, v3.16b
- uzp2 v18.4s, v18.4s, v17.4s
- zip2 v16.4s, v16.4s, v20.4s
- ushr v17.4s, v19.4s, #7
- shl v19.4s, v19.4s, #25
- orr v17.16b, v19.16b, v17.16b
- ext v5.16b, v5.16b, v5.16b, #8
- add v19.4s, v25.4s, v17.4s
- eor v5.16b, v19.16b, v5.16b
- ext v21.16b, v21.16b, v21.16b, #4
- tbl v5.16b, { v5.16b }, v0.16b
- add v4.4s, v19.4s, v4.4s
- add v19.4s, v21.4s, v5.4s
- eor v17.16b, v17.16b, v19.16b
- ushr v21.4s, v17.4s, #12
- shl v17.4s, v17.4s, #20
- orr v17.16b, v17.16b, v21.16b
- add v4.4s, v4.4s, v17.4s
- eor v5.16b, v5.16b, v4.16b
- tbl v5.16b, { v5.16b }, v1.16b
- add v4.4s, v4.4s, v6.4s
- add v6.4s, v19.4s, v5.4s
- eor v17.16b, v17.16b, v6.16b
- ushr v19.4s, v17.4s, #7
- shl v17.4s, v17.4s, #25
- ext v4.16b, v4.16b, v4.16b, #4
- orr v17.16b, v17.16b, v19.16b
- ext v5.16b, v5.16b, v5.16b, #8
- add v4.4s, v4.4s, v17.4s
+ add v6.4s, v6.4s, v16.4s
eor v5.16b, v4.16b, v5.16b
+ ext v4.16b, v4.16b, v4.16b, #4
+ ext v16.16b, v16.16b, v16.16b, #12
ext v6.16b, v6.16b, v6.16b, #12
- tbl v5.16b, { v5.16b }, v0.16b
+ ushr v18.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ orr v5.16b, v5.16b, v18.16b
+ ext v18.16b, v17.16b, v17.16b, #12
add v6.4s, v6.4s, v5.4s
- eor v17.16b, v17.16b, v6.16b
- ushr v19.4s, v17.4s, #12
- shl v17.4s, v17.4s, #20
+ mov v17.16b, v18.16b
+ eor v7.16b, v7.16b, v6.16b
+ add v6.4s, v6.4s, v18.4s
+ mov v17.s[1], v16.s[2]
+ tbl v7.16b, { v7.16b }, v0.16b
add v4.4s, v4.4s, v7.4s
- orr v17.16b, v17.16b, v19.16b
- add v4.4s, v4.4s, v17.4s
- eor v5.16b, v5.16b, v4.16b
- tbl v5.16b, { v5.16b }, v1.16b
- mov v29.16b, v20.16b
+ eor v5.16b, v4.16b, v5.16b
+ ushr v19.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v5.16b, v5.16b, v19.16b
+ uzp1 v19.4s, v3.4s, v3.4s
+ add v6.4s, v6.4s, v5.4s
+ ext v19.16b, v19.16b, v3.16b, #8
+ eor v7.16b, v7.16b, v6.16b
+ uzp2 v19.4s, v19.4s, v2.4s
+ tbl v7.16b, { v7.16b }, v1.16b
+ add v6.4s, v6.4s, v19.4s
+ add v4.4s, v4.4s, v7.4s
+ ext v6.16b, v6.16b, v6.16b, #4
+ ext v7.16b, v7.16b, v7.16b, #8
+ eor v5.16b, v4.16b, v5.16b
ext v4.16b, v4.16b, v4.16b, #12
+ ushr v20.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ orr v5.16b, v5.16b, v20.16b
+ ext v20.16b, v3.16b, v3.16b, #12
add v6.4s, v6.4s, v5.4s
- mov v29.s[1], v27.s[2]
- add v4.4s, v4.4s, v27.4s
- zip1 v26.2d, v20.2d, v7.2d
- zip1 v7.4s, v16.4s, v27.4s
- zip1 v16.4s, v27.4s, v16.4s
- eor v17.16b, v17.16b, v6.16b
- ext v7.16b, v16.16b, v7.16b, #8
- ushr v16.4s, v17.4s, #7
- shl v17.4s, v17.4s, #25
- orr v16.16b, v17.16b, v16.16b
- ext v5.16b, v5.16b, v5.16b, #8
- add v4.4s, v4.4s, v16.4s
+ ext v3.16b, v3.16b, v20.16b, #12
+ eor v7.16b, v7.16b, v6.16b
+ rev64 v3.4s, v3.4s
+ tbl v7.16b, { v7.16b }, v0.16b
+ trn2 v3.4s, v3.4s, v17.4s
+ add v4.4s, v4.4s, v7.4s
+ add v6.4s, v6.4s, v3.4s
eor v5.16b, v4.16b, v5.16b
- ext v6.16b, v6.16b, v6.16b, #4
- tbl v5.16b, { v5.16b }, v0.16b
+ ushr v17.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v5.16b, v5.16b, v17.16b
+ zip1 v17.2d, v18.2d, v2.2d
+ zip2 v2.4s, v2.4s, v18.4s
add v6.4s, v6.4s, v5.4s
- eor v16.16b, v16.16b, v6.16b
- ushr v17.4s, v16.4s, #12
- shl v16.4s, v16.4s, #20
- add v4.4s, v4.4s, v20.4s
- orr v16.16b, v16.16b, v17.16b
- add v4.4s, v4.4s, v16.4s
- eor v5.16b, v5.16b, v4.16b
- tbl v5.16b, { v5.16b }, v1.16b
+ mov v17.s[3], v16.s[3]
+ zip1 v18.4s, v2.4s, v16.4s
+ zip1 v2.4s, v16.4s, v2.4s
+ eor v7.16b, v7.16b, v6.16b
+ ext v6.16b, v6.16b, v6.16b, #12
+ ext v16.16b, v2.16b, v18.16b, #8
+ tbl v7.16b, { v7.16b }, v1.16b
+ add v20.4s, v4.4s, v7.4s
+ ext v4.16b, v17.16b, v17.16b, #12
+ ext v7.16b, v7.16b, v7.16b, #8
+ eor v5.16b, v20.16b, v5.16b
+ uzp1 v4.4s, v17.4s, v4.4s
+ ushr v17.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ add v6.4s, v6.4s, v4.4s
+ orr v5.16b, v5.16b, v17.16b
+ ext v17.16b, v20.16b, v20.16b, #4
add v6.4s, v6.4s, v5.4s
- eor v16.16b, v16.16b, v6.16b
+ eor v7.16b, v7.16b, v6.16b
+ add v6.4s, v6.4s, v16.4s
+ tbl v7.16b, { v7.16b }, v0.16b
+ add v17.4s, v17.4s, v7.4s
+ eor v5.16b, v17.16b, v5.16b
+ ushr v2.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v2.16b, v5.16b, v2.16b
+ add v5.4s, v6.4s, v2.4s
+ ext v6.16b, v19.16b, v19.16b, #4
+ eor v7.16b, v7.16b, v5.16b
+ uzp1 v18.4s, v6.4s, v6.4s
+ tbl v7.16b, { v7.16b }, v1.16b
+ ext v18.16b, v18.16b, v6.16b, #8
+ add v17.4s, v17.4s, v7.4s
+ uzp2 v18.4s, v18.4s, v3.4s
+ ext v7.16b, v7.16b, v7.16b, #8
+ eor v2.16b, v17.16b, v2.16b
+ add v5.4s, v5.4s, v18.4s
+ ext v17.16b, v17.16b, v17.16b, #12
+ ushr v19.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ ext v5.16b, v5.16b, v5.16b, #4
+ orr v2.16b, v2.16b, v19.16b
+ ext v19.16b, v6.16b, v6.16b, #12
+ add v5.4s, v5.4s, v2.4s
+ ext v6.16b, v6.16b, v19.16b, #12
+ mov v19.16b, v16.16b
+ eor v7.16b, v7.16b, v5.16b
+ rev64 v6.4s, v6.4s
+ mov v19.s[1], v4.s[2]
+ tbl v7.16b, { v7.16b }, v0.16b
+ add v17.4s, v17.4s, v7.4s
+ eor v20.16b, v17.16b, v2.16b
+ trn2 v2.4s, v6.4s, v19.4s
+ ushr v6.4s, v20.4s, #12
+ shl v19.4s, v20.4s, #20
+ add v5.4s, v5.4s, v2.4s
+ orr v6.16b, v19.16b, v6.16b
+ add v19.4s, v5.4s, v6.4s
+ eor v5.16b, v7.16b, v19.16b
+ zip1 v7.2d, v16.2d, v3.2d
+ zip2 v3.4s, v3.4s, v16.4s
+ tbl v20.16b, { v5.16b }, v1.16b
+ mov v7.s[3], v4.s[3]
+ add v17.4s, v17.4s, v20.4s
+ ext v5.16b, v7.16b, v7.16b, #12
+ eor v6.16b, v17.16b, v6.16b
+ uzp1 v5.4s, v7.4s, v5.4s
+ ext v7.16b, v19.16b, v19.16b, #12
+ ext v17.16b, v17.16b, v17.16b, #4
+ ushr v19.4s, v6.4s, #7
+ shl v6.4s, v6.4s, #25
+ add v7.4s, v7.4s, v5.4s
+ orr v6.16b, v6.16b, v19.16b
+ ext v19.16b, v20.16b, v20.16b, #8
+ add v7.4s, v7.4s, v6.4s
+ eor v19.16b, v19.16b, v7.16b
+ tbl v19.16b, { v19.16b }, v0.16b
+ add v16.4s, v17.4s, v19.4s
+ zip1 v17.4s, v3.4s, v4.4s
+ zip1 v3.4s, v4.4s, v3.4s
+ eor v4.16b, v16.16b, v6.16b
+ ext v17.16b, v3.16b, v17.16b, #8
+ ushr v3.4s, v4.4s, #12
+ shl v4.4s, v4.4s, #20
+ add v6.4s, v7.4s, v17.4s
+ orr v3.16b, v4.16b, v3.16b
+ add v4.4s, v6.4s, v3.4s
+ ext v6.16b, v18.16b, v18.16b, #4
+ eor v7.16b, v19.16b, v4.16b
+ uzp1 v18.4s, v6.4s, v6.4s
+ tbl v7.16b, { v7.16b }, v1.16b
+ ext v18.16b, v18.16b, v6.16b, #8
+ add v16.4s, v16.4s, v7.4s
+ uzp2 v18.4s, v18.4s, v2.4s
+ ext v7.16b, v7.16b, v7.16b, #8
+ eor v3.16b, v16.16b, v3.16b
add v4.4s, v4.4s, v18.4s
- ushr v17.4s, v16.4s, #7
- shl v16.4s, v16.4s, #25
- ext v23.16b, v22.16b, v22.16b, #12
+ ext v16.16b, v16.16b, v16.16b, #12
+ ushr v19.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
ext v4.16b, v4.16b, v4.16b, #4
- orr v16.16b, v16.16b, v17.16b
- ext v28.16b, v22.16b, v23.16b, #12
- ext v5.16b, v5.16b, v5.16b, #8
- add v4.4s, v16.4s, v4.4s
- tbl v3.16b, { v28.16b, v29.16b }, v3.16b
- eor v5.16b, v4.16b, v5.16b
- ext v6.16b, v6.16b, v6.16b, #12
- add v3.4s, v4.4s, v3.4s
- tbl v4.16b, { v5.16b }, v0.16b
- add v5.4s, v6.4s, v4.4s
- eor v6.16b, v16.16b, v5.16b
- ushr v16.4s, v6.4s, #12
+ orr v3.16b, v3.16b, v19.16b
+ ext v19.16b, v6.16b, v6.16b, #12
+ add v4.4s, v4.4s, v3.4s
+ ext v6.16b, v6.16b, v19.16b, #12
+ mov v19.16b, v17.16b
+ eor v7.16b, v7.16b, v4.16b
+ rev64 v6.4s, v6.4s
+ mov v19.s[1], v5.s[2]
+ tbl v7.16b, { v7.16b }, v0.16b
+ add v16.4s, v16.4s, v7.4s
+ eor v20.16b, v16.16b, v3.16b
+ trn2 v3.4s, v6.4s, v19.4s
+ ushr v6.4s, v20.4s, #12
+ shl v19.4s, v20.4s, #20
+ add v4.4s, v4.4s, v3.4s
+ orr v6.16b, v19.16b, v6.16b
+ zip1 v19.2d, v17.2d, v2.2d
+ zip2 v2.4s, v2.4s, v17.4s
+ add v4.4s, v4.4s, v6.4s
+ mov v19.s[3], v5.s[3]
+ zip1 v17.4s, v2.4s, v5.4s
+ zip1 v2.4s, v5.4s, v2.4s
+ eor v7.16b, v7.16b, v4.16b
+ ext v20.16b, v19.16b, v19.16b, #12
+ ext v4.16b, v4.16b, v4.16b, #12
+ ext v2.16b, v2.16b, v17.16b, #8
+ tbl v7.16b, { v7.16b }, v1.16b
+ add v16.4s, v16.4s, v7.4s
+ ext v7.16b, v7.16b, v7.16b, #8
+ eor v21.16b, v16.16b, v6.16b
+ uzp1 v6.4s, v19.4s, v20.4s
+ ext v16.16b, v16.16b, v16.16b, #4
+ ushr v19.4s, v21.4s, #7
+ shl v20.4s, v21.4s, #25
+ add v4.4s, v4.4s, v6.4s
+ orr v19.16b, v20.16b, v19.16b
+ add v4.4s, v4.4s, v19.4s
+ eor v7.16b, v7.16b, v4.16b
+ add v4.4s, v4.4s, v2.4s
+ tbl v7.16b, { v7.16b }, v0.16b
+ add v16.4s, v16.4s, v7.4s
+ eor v5.16b, v16.16b, v19.16b
+ ushr v17.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v5.16b, v5.16b, v17.16b
+ ext v17.16b, v18.16b, v18.16b, #4
+ add v4.4s, v4.4s, v5.4s
+ uzp1 v18.4s, v17.4s, v17.4s
+ eor v7.16b, v7.16b, v4.16b
+ ext v18.16b, v18.16b, v17.16b, #8
+ tbl v7.16b, { v7.16b }, v1.16b
+ uzp2 v18.4s, v18.4s, v3.4s
+ add v16.4s, v16.4s, v7.4s
+ add v4.4s, v4.4s, v18.4s
+ ext v7.16b, v7.16b, v7.16b, #8
+ eor v5.16b, v16.16b, v5.16b
+ ext v4.16b, v4.16b, v4.16b, #4
+ ext v16.16b, v16.16b, v16.16b, #12
+ ushr v19.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ orr v5.16b, v5.16b, v19.16b
+ add v19.4s, v4.4s, v5.4s
+ eor v4.16b, v7.16b, v19.16b
+ ext v7.16b, v17.16b, v17.16b, #12
+ tbl v20.16b, { v4.16b }, v0.16b
+ ext v4.16b, v17.16b, v7.16b, #12
+ mov v7.16b, v2.16b
+ add v16.4s, v16.4s, v20.4s
+ rev64 v4.4s, v4.4s
+ mov v7.s[1], v6.s[2]
+ eor v5.16b, v16.16b, v5.16b
+ trn2 v4.4s, v4.4s, v7.4s
+ ushr v7.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ add v17.4s, v19.4s, v4.4s
+ zip1 v19.2d, v2.2d, v3.2d
+ zip2 v2.4s, v3.4s, v2.4s
+ orr v5.16b, v5.16b, v7.16b
+ mov v19.s[3], v6.s[3]
+ add v7.4s, v17.4s, v5.4s
+ eor v17.16b, v20.16b, v7.16b
+ ext v20.16b, v19.16b, v19.16b, #12
+ ext v7.16b, v7.16b, v7.16b, #12
+ tbl v17.16b, { v17.16b }, v1.16b
+ add v16.4s, v16.4s, v17.4s
+ ext v17.16b, v17.16b, v17.16b, #8
+ eor v21.16b, v16.16b, v5.16b
+ uzp1 v5.4s, v19.4s, v20.4s
+ ext v16.16b, v16.16b, v16.16b, #4
+ ushr v19.4s, v21.4s, #7
+ shl v20.4s, v21.4s, #25
+ add v7.4s, v7.4s, v5.4s
+ orr v19.16b, v20.16b, v19.16b
+ add v7.4s, v7.4s, v19.4s
+ eor v17.16b, v17.16b, v7.16b
+ tbl v17.16b, { v17.16b }, v0.16b
+ add v3.4s, v16.4s, v17.4s
+ zip1 v16.4s, v2.4s, v6.4s
+ zip1 v2.4s, v6.4s, v2.4s
+ eor v6.16b, v3.16b, v19.16b
+ ext v16.16b, v2.16b, v16.16b, #8
+ ushr v2.4s, v6.4s, #12
shl v6.4s, v6.4s, #20
- orr v6.16b, v6.16b, v16.16b
- tbl v2.16b, { v26.16b, v27.16b }, v2.16b
- add v3.4s, v3.4s, v6.4s
- ext v19.16b, v2.16b, v2.16b, #12
- eor v4.16b, v4.16b, v3.16b
- uzp1 v2.4s, v2.4s, v19.4s
+ add v7.4s, v7.4s, v16.4s
+ orr v2.16b, v6.16b, v2.16b
+ add v6.4s, v7.4s, v2.4s
+ ext v7.16b, v18.16b, v18.16b, #4
+ eor v17.16b, v17.16b, v6.16b
+ uzp1 v18.4s, v7.4s, v7.4s
+ tbl v17.16b, { v17.16b }, v1.16b
+ ext v18.16b, v18.16b, v7.16b, #8
+ add v3.4s, v3.4s, v17.4s
+ uzp2 v18.4s, v18.4s, v4.4s
+ eor v2.16b, v3.16b, v2.16b
+ add v6.4s, v6.4s, v18.4s
ext v3.16b, v3.16b, v3.16b, #12
- tbl v4.16b, { v4.16b }, v1.16b
- add v2.4s, v3.4s, v2.4s
- add v3.4s, v5.4s, v4.4s
- eor v5.16b, v6.16b, v3.16b
- ushr v6.4s, v5.4s, #7
+ ext v18.16b, v18.16b, v18.16b, #4
+ ushr v19.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ ext v6.16b, v6.16b, v6.16b, #4
+ orr v19.16b, v2.16b, v19.16b
+ ext v2.16b, v17.16b, v17.16b, #8
+ ext v17.16b, v7.16b, v7.16b, #12
+ add v6.4s, v6.4s, v19.4s
+ eor v2.16b, v2.16b, v6.16b
+ tbl v20.16b, { v2.16b }, v0.16b
+ ext v2.16b, v7.16b, v17.16b, #12
+ mov v7.16b, v16.16b
+ add v17.4s, v3.4s, v20.4s
+ rev64 v3.4s, v2.4s
+ mov v7.s[1], v5.s[2]
+ eor v19.16b, v17.16b, v19.16b
+ trn2 v3.4s, v3.4s, v7.4s
+ ushr v21.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ add v6.4s, v6.4s, v3.4s
+ orr v19.16b, v19.16b, v21.16b
+ add v21.4s, v6.4s, v19.4s
+ eor v6.16b, v20.16b, v21.16b
+ zip1 v20.2d, v16.2d, v4.2d
+ zip2 v4.4s, v4.4s, v16.4s
+ tbl v22.16b, { v6.16b }, v1.16b
+ mov v20.s[3], v5.s[3]
+ add v17.4s, v17.4s, v22.4s
+ ext v6.16b, v20.16b, v20.16b, #12
+ eor v19.16b, v17.16b, v19.16b
+ uzp1 v6.4s, v20.4s, v6.4s
+ ext v20.16b, v21.16b, v21.16b, #12
+ ext v17.16b, v17.16b, v17.16b, #4
+ ushr v21.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ add v20.4s, v20.4s, v6.4s
+ orr v19.16b, v19.16b, v21.16b
+ ext v21.16b, v22.16b, v22.16b, #8
+ add v20.4s, v20.4s, v19.4s
+ eor v21.16b, v21.16b, v20.16b
+ tbl v21.16b, { v21.16b }, v0.16b
+ add v16.4s, v17.4s, v21.4s
+ zip1 v17.4s, v4.4s, v5.4s
+ zip1 v4.4s, v5.4s, v4.4s
+ eor v5.16b, v16.16b, v19.16b
+ ext v4.16b, v4.16b, v17.16b, #8
+ ushr v17.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ add v19.4s, v20.4s, v4.4s
+ ext v20.16b, v18.16b, v18.16b, #8
+ zip1 v3.2d, v4.2d, v3.2d
+ orr v5.16b, v5.16b, v17.16b
+ zip2 v2.4s, v2.4s, v4.4s
+ uzp2 v7.4s, v20.4s, v7.4s
+ mov v3.s[3], v6.s[3]
+ add v17.4s, v19.4s, v5.4s
+ ext v7.16b, v7.16b, v20.16b, #4
+ eor v19.16b, v21.16b, v17.16b
+ ext v17.16b, v17.16b, v17.16b, #4
+ tbl v19.16b, { v19.16b }, v1.16b
+ add v7.4s, v17.4s, v7.4s
+ add v16.4s, v16.4s, v19.4s
+ ext v17.16b, v19.16b, v19.16b, #8
+ ext v19.16b, v18.16b, v18.16b, #12
+ eor v5.16b, v16.16b, v5.16b
+ ext v16.16b, v16.16b, v16.16b, #12
+ ext v18.16b, v18.16b, v19.16b, #12
+ mov v19.16b, v4.16b
+ ushr v20.4s, v5.4s, #7
shl v5.4s, v5.4s, #25
- orr v5.16b, v5.16b, v6.16b
- ext v4.16b, v4.16b, v4.16b, #8
- add v2.4s, v2.4s, v5.4s
- eor v4.16b, v2.16b, v4.16b
- ext v3.16b, v3.16b, v3.16b, #4
- tbl v0.16b, { v4.16b }, v0.16b
- add v3.4s, v3.4s, v0.4s
- eor v4.16b, v5.16b, v3.16b
- ushr v5.4s, v4.4s, #12
- shl v4.4s, v4.4s, #20
- add v2.4s, v2.4s, v7.4s
- orr v4.16b, v4.16b, v5.16b
- add v2.4s, v2.4s, v4.4s
+ rev64 v18.4s, v18.4s
+ mov v19.s[1], v6.s[2]
+ orr v5.16b, v5.16b, v20.16b
+ trn2 v18.4s, v18.4s, v19.4s
+ add v7.4s, v5.4s, v7.4s
+ eor v17.16b, v17.16b, v7.16b
+ add v7.4s, v7.4s, v18.4s
+ ext v18.16b, v3.16b, v3.16b, #12
+ tbl v17.16b, { v17.16b }, v0.16b
+ uzp1 v3.4s, v3.4s, v18.4s
+ add v16.4s, v16.4s, v17.4s
+ eor v5.16b, v16.16b, v5.16b
+ ushr v19.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v5.16b, v5.16b, v19.16b
+ add v7.4s, v7.4s, v5.4s
+ eor v17.16b, v17.16b, v7.16b
+ ext v7.16b, v7.16b, v7.16b, #12
+ tbl v17.16b, { v17.16b }, v1.16b
+ add v3.4s, v7.4s, v3.4s
+ add v16.4s, v16.4s, v17.4s
+ ext v7.16b, v17.16b, v17.16b, #8
+ eor v5.16b, v16.16b, v5.16b
+ ext v16.16b, v16.16b, v16.16b, #4
+ ushr v18.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ orr v5.16b, v5.16b, v18.16b
+ add v3.4s, v3.4s, v5.4s
+ eor v7.16b, v7.16b, v3.16b
+ tbl v0.16b, { v7.16b }, v0.16b
+ zip1 v7.4s, v2.4s, v6.4s
+ zip1 v2.4s, v6.4s, v2.4s
+ add v4.4s, v16.4s, v0.4s
+ ext v2.16b, v2.16b, v7.16b, #8
+ eor v5.16b, v4.16b, v5.16b
+ add v2.4s, v3.4s, v2.4s
+ ushr v6.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v3.16b, v5.16b, v6.16b
+ add v2.4s, v2.4s, v3.4s
eor v0.16b, v0.16b, v2.16b
- tbl v0.16b, { v0.16b }, v1.16b
- add v1.4s, v3.4s, v0.4s
- eor v3.16b, v4.16b, v1.16b
ext v2.16b, v2.16b, v2.16b, #4
+ tbl v0.16b, { v0.16b }, v1.16b
+ add v1.4s, v4.4s, v0.4s
+ ext v0.16b, v0.16b, v0.16b, #8
+ eor v3.16b, v1.16b, v3.16b
ext v1.16b, v1.16b, v1.16b, #12
ushr v4.4s, v3.4s, #7
shl v3.4s, v3.4s, #25
- ext v0.16b, v0.16b, v0.16b, #8
- eor v1.16b, v2.16b, v1.16b
- orr v2.16b, v3.16b, v4.16b
+ stp q1, q0, [x0, #32]
+ orr v3.16b, v3.16b, v4.16b
+ stp q2, q3, [x0]
+ ret
+.Lfunc_end1:
+ .size compress_pre, .Lfunc_end1-compress_pre
+ .cfi_endproc
+
+ .globl zfs_blake3_compress_xof_sse41
+ .p2align 2
+ .type zfs_blake3_compress_xof_sse41,@function
+zfs_blake3_compress_xof_sse41: /* Wrapper: runs compress_pre into a 64-byte stack buffer, then writes 64 bytes to [x5] built from that buffer and the 32 bytes at [x0]. */
+ .cfi_startproc
+ hint #25 /* PACIASP encoding: sign the return address in x30 */
+ .cfi_negate_ra_state
+ sub sp, sp, #96 /* 64 bytes of scratch at [sp] plus 32 bytes for x29/x30 and x20/x19 */
+ stp x29, x30, [sp, #64]
+ add x29, sp, #64
+ stp x20, x19, [sp, #80]
+ .cfi_def_cfa w29, 32
+ .cfi_offset w19, -8
+ .cfi_offset w20, -16
+ .cfi_offset w30, -24
+ .cfi_offset w29, -32
+ mov x20, x0 /* keep incoming x0; it is re-read after the call */
+ mov x19, x5 /* keep incoming x5, the destination pointer */
+ mov w5, w4 /* shift the incoming arguments up by one register ... */
+ mov x4, x3
+ mov w3, w2
+ mov x2, x1
+ mov x0, sp /* ... so x0 can point at the stack scratch buffer */
+ mov x1, x20 /* and the original x0 becomes the second argument */
+ bl compress_pre
+ ldp q0, q1, [sp] /* scratch bytes 0..31 */
+ ldp q2, q3, [sp, #32] /* scratch bytes 32..63 */
eor v0.16b, v2.16b, v0.16b
- stp q1, q0, [x0]
+ eor v1.16b, v3.16b, v1.16b /* q0/q1 now hold scratch[0..31] XOR scratch[32..63] */
+ ldp x29, x30, [sp, #64]
+ stp q0, q1, [x19] /* destination bytes 0..31 */
+ ldr q0, [x20]
+ eor v0.16b, v0.16b, v2.16b
+ str q0, [x19, #32] /* destination bytes 32..47 = [x20] bytes 0..15 XOR scratch[32..47] */
+ ldr q0, [x20, #16]
+ eor v0.16b, v0.16b, v3.16b
+ str q0, [x19, #48] /* destination bytes 48..63 = [x20] bytes 16..31 XOR scratch[48..63] */
+ ldp x20, x19, [sp, #80]
+ add sp, sp, #96
+ hint #29 /* AUTIASP encoding: authenticate x30 before returning */
ret
-.Lfunc_end0:
- .size zfs_blake3_compress_in_place_sse41, .Lfunc_end0-zfs_blake3_compress_in_place_sse41
+.Lfunc_end2:
+ .size zfs_blake3_compress_xof_sse41, .Lfunc_end2-zfs_blake3_compress_xof_sse41
.cfi_endproc
.section .rodata.cst16,"aM",@progbits,16
.p2align 4
-.LCPI1_0:
+.LCPI3_0: /* words 0,1,2,3; zfs_blake3_hash_many_sse41 loads this and ANDs it with a mask replicated from bit 0 of w5 */
+ .word 0
+ .word 1
+ .word 2
+ .word 3
+.LCPI3_1: /* 16-byte table; zfs_blake3_hash_many_sse41 loads it into q1 and spills it to the stack */
.byte 2
.byte 3
.byte 0
.byte 15
.byte 12
.byte 13
-.LCPI1_1:
- .word 1779033703
- .word 3144134277
- .word 1013904242
- .word 2773480762
-.LCPI1_2:
+.LCPI3_2: /* 16-byte table; its page address is taken into x10 at the top of zfs_blake3_hash_many_sse41 */
.byte 1
.byte 2
.byte 3
.byte 14
.byte 15
.byte 12
-.LCPI1_3:
- .byte 0
- .byte 1
- .byte 2
- .byte 3
- .byte 20
- .byte 21
- .byte 22
- .byte 23
- .byte 8
- .byte 9
- .byte 10
- .byte 11
- .byte 28
- .byte 29
- .byte 30
- .byte 31
-.LCPI1_4:
- .byte 0
- .byte 1
- .byte 2
- .byte 3
- .byte 4
- .byte 5
- .byte 6
- .byte 7
- .byte 8
- .byte 9
- .byte 10
- .byte 11
- .byte 28
- .byte 29
- .byte 30
- .byte 31
+.LCPI3_3: /* NOTE(review): these four words look like the first four BLAKE3 IV words - confirm against the spec */
+ .word 1779033703 /* 0x6A09E667 */
+ .word 3144134277 /* 0xBB67AE85 */
+ .word 1013904242 /* 0x3C6EF372 */
+ .word 2773480762 /* 0xA54FF53A */
.text
- .globl zfs_blake3_compress_xof_sse41
+ .globl zfs_blake3_hash_many_sse41
.p2align 2
- .type zfs_blake3_compress_xof_sse41,@function
-zfs_blake3_compress_xof_sse41:
+ .type zfs_blake3_hash_many_sse41,@function
+zfs_blake3_hash_many_sse41:
.cfi_startproc
- ldp q7, q6, [x0]
- ldp q17, q18, [x1]
- add x12, x1, #32
- ld2 { v4.4s, v5.4s }, [x12]
- lsr x10, x3, #32
- fmov s16, w3
- adrp x13, .LCPI1_0
- adrp x11, .LCPI1_1
- and w8, w2, #0xff
- mov v16.s[1], w10
- ldr q0, [x13, :lo12:.LCPI1_0]
- ldr q20, [x11, :lo12:.LCPI1_1]
- adrp x11, .LCPI1_4
- and w9, w4, #0xff
- ldr q2, [x11, :lo12:.LCPI1_4]
- mov v16.s[2], w8
- uzp1 v21.4s, v17.4s, v18.4s
- add v7.4s, v6.4s, v7.4s
- adrp x12, .LCPI1_3
- mov v16.s[3], w9
- uzp2 v18.4s, v17.4s, v18.4s
- add v7.4s, v7.4s, v21.4s
- ext v17.16b, v5.16b, v5.16b, #12
- ldr q3, [x12, :lo12:.LCPI1_3]
- ext v24.16b, v4.16b, v4.16b, #12
- eor v16.16b, v7.16b, v16.16b
- mov v27.16b, v17.16b
- uzp1 v19.4s, v21.4s, v21.4s
- ext v25.16b, v21.16b, v21.16b, #12
- zip2 v28.4s, v18.4s, v17.4s
- tbl v29.16b, { v16.16b }, v0.16b
- mov v27.s[1], v24.s[2]
- zip1 v23.2d, v17.2d, v18.2d
- ext v19.16b, v19.16b, v21.16b, #8
- add v22.4s, v29.4s, v20.4s
- ext v26.16b, v21.16b, v25.16b, #12
- tbl v20.16b, { v23.16b, v24.16b }, v2.16b
- zip1 v21.4s, v28.4s, v24.4s
- zip1 v23.4s, v24.4s, v28.4s
- uzp2 v19.4s, v19.4s, v18.4s
- eor v24.16b, v22.16b, v6.16b
- ext v25.16b, v20.16b, v20.16b, #12
- ext v6.16b, v23.16b, v21.16b, #8
- add v7.4s, v7.4s, v18.4s
- ext v18.16b, v19.16b, v19.16b, #4
- tbl v16.16b, { v26.16b, v27.16b }, v3.16b
- uzp1 v21.4s, v20.4s, v25.4s
- mov v26.16b, v6.16b
- ext v23.16b, v18.16b, v18.16b, #12
- mov v26.s[1], v21.s[2]
- adrp x10, .LCPI1_2
- ext v25.16b, v18.16b, v23.16b, #12
- uzp1 v23.4s, v18.4s, v18.4s
- ldr q1, [x10, :lo12:.LCPI1_2]
- ext v18.16b, v23.16b, v18.16b, #8
- ushr v23.4s, v24.4s, #12
- shl v24.4s, v24.4s, #20
- orr v23.16b, v24.16b, v23.16b
- add v7.4s, v7.4s, v23.4s
- eor v27.16b, v29.16b, v7.16b
- add v4.4s, v7.4s, v4.4s
- tbl v7.16b, { v25.16b, v26.16b }, v3.16b
- tbl v26.16b, { v27.16b }, v1.16b
- add v22.4s, v22.4s, v26.4s
- uzp2 v18.4s, v18.4s, v16.4s
- eor v23.16b, v23.16b, v22.16b
- ext v5.16b, v18.16b, v18.16b, #4
- ushr v27.4s, v23.4s, #7
- shl v23.4s, v23.4s, #25
- uzp1 v25.4s, v5.4s, v5.4s
- orr v23.16b, v23.16b, v27.16b
- ext v28.16b, v4.16b, v4.16b, #12
- ext v4.16b, v25.16b, v5.16b, #8
- ext v25.16b, v26.16b, v26.16b, #8
- add v26.4s, v28.4s, v23.4s
- eor v25.16b, v26.16b, v25.16b
- ext v22.16b, v22.16b, v22.16b, #4
- tbl v25.16b, { v25.16b }, v0.16b
- add v22.4s, v22.4s, v25.4s
- eor v23.16b, v23.16b, v22.16b
- add v17.4s, v26.4s, v17.4s
- ushr v26.4s, v23.4s, #12
- shl v23.4s, v23.4s, #20
- orr v23.16b, v23.16b, v26.16b
- add v17.4s, v17.4s, v23.4s
- eor v25.16b, v25.16b, v17.16b
- add v17.4s, v17.4s, v19.4s
- tbl v19.16b, { v25.16b }, v1.16b
- add v22.4s, v22.4s, v19.4s
- eor v23.16b, v23.16b, v22.16b
- ushr v25.4s, v23.4s, #7
- shl v23.4s, v23.4s, #25
- ext v17.16b, v17.16b, v17.16b, #4
- orr v23.16b, v23.16b, v25.16b
- ext v19.16b, v19.16b, v19.16b, #8
- add v17.4s, v17.4s, v23.4s
- eor v19.16b, v17.16b, v19.16b
- ext v22.16b, v22.16b, v22.16b, #12
- tbl v19.16b, { v19.16b }, v0.16b
- add v22.4s, v22.4s, v19.4s
- eor v23.16b, v23.16b, v22.16b
- ushr v25.4s, v23.4s, #12
- shl v23.4s, v23.4s, #20
- add v17.4s, v17.4s, v16.4s
- orr v23.16b, v23.16b, v25.16b
- add v17.4s, v17.4s, v23.4s
- ext v25.16b, v17.16b, v17.16b, #12
- eor v17.16b, v19.16b, v17.16b
- tbl v17.16b, { v17.16b }, v1.16b
- add v19.4s, v22.4s, v17.4s
- eor v22.16b, v23.16b, v19.16b
- add v25.4s, v25.4s, v21.4s
- zip1 v20.2d, v6.2d, v16.2d
- ushr v23.4s, v22.4s, #7
- shl v22.4s, v22.4s, #25
- zip2 v24.4s, v16.4s, v6.4s
- tbl v26.16b, { v20.16b, v21.16b }, v2.16b
- orr v22.16b, v22.16b, v23.16b
- zip1 v16.4s, v24.4s, v21.4s
- zip1 v20.4s, v21.4s, v24.4s
- ext v21.16b, v26.16b, v26.16b, #12
- ext v17.16b, v17.16b, v17.16b, #8
- add v25.4s, v25.4s, v22.4s
- ext v16.16b, v20.16b, v16.16b, #8
- uzp1 v21.4s, v26.4s, v21.4s
- eor v26.16b, v25.16b, v17.16b
- ext v19.16b, v19.16b, v19.16b, #4
- tbl v26.16b, { v26.16b }, v0.16b
- mov v29.16b, v16.16b
- add v19.4s, v19.4s, v26.4s
- ext v27.16b, v5.16b, v5.16b, #12
- mov v29.s[1], v21.s[2]
- eor v22.16b, v22.16b, v19.16b
- ext v28.16b, v5.16b, v27.16b, #12
- ushr v27.4s, v22.4s, #12
- shl v22.4s, v22.4s, #20
- add v6.4s, v25.4s, v6.4s
- orr v22.16b, v22.16b, v27.16b
- add v6.4s, v6.4s, v22.4s
- eor v26.16b, v26.16b, v6.16b
- add v6.4s, v6.4s, v18.4s
- tbl v18.16b, { v26.16b }, v1.16b
- add v19.4s, v19.4s, v18.4s
- eor v22.16b, v22.16b, v19.16b
- ushr v26.4s, v22.4s, #7
- shl v22.4s, v22.4s, #25
- ext v6.16b, v6.16b, v6.16b, #4
- orr v22.16b, v22.16b, v26.16b
- ext v18.16b, v18.16b, v18.16b, #8
- add v6.4s, v6.4s, v22.4s
- eor v18.16b, v6.16b, v18.16b
- ext v19.16b, v19.16b, v19.16b, #12
- tbl v18.16b, { v18.16b }, v0.16b
- add v19.4s, v19.4s, v18.4s
- eor v22.16b, v22.16b, v19.16b
- ushr v26.4s, v22.4s, #12
- shl v22.4s, v22.4s, #20
- add v6.4s, v6.4s, v7.4s
- orr v22.16b, v22.16b, v26.16b
- add v6.4s, v6.4s, v22.4s
- ext v26.16b, v6.16b, v6.16b, #12
- eor v6.16b, v18.16b, v6.16b
- uzp2 v4.4s, v4.4s, v7.4s
- zip2 v25.4s, v7.4s, v16.4s
- add v26.4s, v26.4s, v21.4s
- zip1 v20.2d, v16.2d, v7.2d
- tbl v6.16b, { v6.16b }, v1.16b
- ext v24.16b, v4.16b, v4.16b, #4
- tbl v27.16b, { v20.16b, v21.16b }, v2.16b
- zip1 v7.4s, v25.4s, v21.4s
- zip1 v20.4s, v21.4s, v25.4s
- add v18.4s, v19.4s, v6.4s
- uzp1 v5.4s, v24.4s, v24.4s
- ext v21.16b, v27.16b, v27.16b, #12
- ext v7.16b, v20.16b, v7.16b, #8
- eor v19.16b, v22.16b, v18.16b
- ext v5.16b, v5.16b, v24.16b, #8
- tbl v17.16b, { v28.16b, v29.16b }, v3.16b
- uzp1 v21.4s, v27.4s, v21.4s
- mov v28.16b, v7.16b
- ushr v22.4s, v19.4s, #7
- shl v19.4s, v19.4s, #25
- ext v23.16b, v24.16b, v24.16b, #12
- uzp2 v5.4s, v5.4s, v17.4s
- mov v28.s[1], v21.s[2]
- orr v19.16b, v19.16b, v22.16b
- ext v27.16b, v24.16b, v23.16b, #12
- ext v23.16b, v5.16b, v5.16b, #4
- ext v6.16b, v6.16b, v6.16b, #8
- ext v25.16b, v18.16b, v18.16b, #4
- add v18.4s, v26.4s, v19.4s
- uzp1 v24.4s, v23.4s, v23.4s
- eor v6.16b, v18.16b, v6.16b
- ext v24.16b, v24.16b, v23.16b, #8
- add v16.4s, v18.4s, v16.4s
- tbl v18.16b, { v27.16b, v28.16b }, v3.16b
- tbl v27.16b, { v6.16b }, v0.16b
- uzp2 v6.4s, v24.4s, v18.4s
- add v24.4s, v25.4s, v27.4s
- eor v19.16b, v19.16b, v24.16b
- ushr v25.4s, v19.4s, #12
- shl v19.4s, v19.4s, #20
- orr v19.16b, v19.16b, v25.16b
- add v16.4s, v16.4s, v19.4s
- eor v25.16b, v27.16b, v16.16b
- add v4.4s, v16.4s, v4.4s
- tbl v16.16b, { v25.16b }, v1.16b
- add v24.4s, v24.4s, v16.4s
- eor v19.16b, v19.16b, v24.16b
- ushr v25.4s, v19.4s, #7
- shl v19.4s, v19.4s, #25
- ext v4.16b, v4.16b, v4.16b, #4
- orr v19.16b, v19.16b, v25.16b
- ext v16.16b, v16.16b, v16.16b, #8
- add v4.4s, v4.4s, v19.4s
- eor v16.16b, v4.16b, v16.16b
- ext v24.16b, v24.16b, v24.16b, #12
- tbl v25.16b, { v16.16b }, v0.16b
- add v24.4s, v24.4s, v25.4s
- eor v16.16b, v19.16b, v24.16b
- ushr v19.4s, v16.4s, #12
- shl v16.4s, v16.4s, #20
- add v4.4s, v4.4s, v17.4s
- orr v19.16b, v16.16b, v19.16b
- add v27.4s, v4.4s, v19.4s
- eor v25.16b, v25.16b, v27.16b
- tbl v25.16b, { v25.16b }, v1.16b
- add v24.4s, v24.4s, v25.4s
- zip2 v26.4s, v17.4s, v7.4s
- ext v4.16b, v27.16b, v27.16b, #12
- eor v19.16b, v19.16b, v24.16b
- add v28.4s, v4.4s, v21.4s
- zip1 v20.2d, v7.2d, v17.2d
- zip1 v4.4s, v26.4s, v21.4s
- zip1 v17.4s, v21.4s, v26.4s
- ushr v26.4s, v19.4s, #7
- shl v19.4s, v19.4s, #25
- orr v19.16b, v19.16b, v26.16b
- ext v25.16b, v25.16b, v25.16b, #8
- add v27.4s, v28.4s, v19.4s
- eor v25.16b, v27.16b, v25.16b
- ext v24.16b, v24.16b, v24.16b, #4
- tbl v25.16b, { v25.16b }, v0.16b
- add v24.4s, v24.4s, v25.4s
- eor v19.16b, v19.16b, v24.16b
- add v7.4s, v27.4s, v7.4s
- ushr v27.4s, v19.4s, #12
- shl v19.4s, v19.4s, #20
- orr v19.16b, v19.16b, v27.16b
- add v7.4s, v7.4s, v19.4s
- eor v25.16b, v25.16b, v7.16b
- add v5.4s, v7.4s, v5.4s
- tbl v7.16b, { v25.16b }, v1.16b
- add v24.4s, v24.4s, v7.4s
- eor v19.16b, v19.16b, v24.16b
- ushr v25.4s, v19.4s, #7
- shl v19.4s, v19.4s, #25
- ext v5.16b, v5.16b, v5.16b, #4
- orr v19.16b, v19.16b, v25.16b
- ext v7.16b, v7.16b, v7.16b, #8
- add v5.4s, v5.4s, v19.4s
- eor v7.16b, v5.16b, v7.16b
- ext v24.16b, v24.16b, v24.16b, #12
- tbl v7.16b, { v7.16b }, v0.16b
- add v24.4s, v24.4s, v7.4s
- eor v19.16b, v19.16b, v24.16b
- ushr v25.4s, v19.4s, #12
- shl v19.4s, v19.4s, #20
- tbl v16.16b, { v20.16b, v21.16b }, v2.16b
- add v5.4s, v5.4s, v18.4s
- orr v19.16b, v19.16b, v25.16b
- ext v20.16b, v16.16b, v16.16b, #12
- ext v4.16b, v17.16b, v4.16b, #8
- add v5.4s, v5.4s, v19.4s
- uzp1 v21.4s, v16.4s, v20.4s
- mov v17.16b, v4.16b
- ext v25.16b, v5.16b, v5.16b, #12
- mov v17.s[1], v21.s[2]
- add v25.4s, v25.4s, v21.4s
- zip1 v20.2d, v4.2d, v18.2d
- ext v22.16b, v23.16b, v23.16b, #12
- zip2 v26.4s, v18.4s, v4.4s
- tbl v18.16b, { v20.16b, v21.16b }, v2.16b
- eor v5.16b, v7.16b, v5.16b
- ext v16.16b, v23.16b, v22.16b, #12
- ext v22.16b, v6.16b, v6.16b, #4
- zip1 v27.4s, v26.4s, v21.4s
- zip1 v20.4s, v21.4s, v26.4s
- ext v21.16b, v18.16b, v18.16b, #12
- tbl v5.16b, { v5.16b }, v1.16b
- ext v20.16b, v20.16b, v27.16b, #8
- uzp1 v27.4s, v18.4s, v21.4s
- uzp1 v18.4s, v22.4s, v22.4s
- add v21.4s, v24.4s, v5.4s
- ext v18.16b, v18.16b, v22.16b, #8
- eor v19.16b, v19.16b, v21.16b
- tbl v7.16b, { v16.16b, v17.16b }, v3.16b
- uzp2 v18.4s, v18.4s, v17.4s
- zip2 v16.4s, v16.4s, v20.4s
- ushr v17.4s, v19.4s, #7
- shl v19.4s, v19.4s, #25
- orr v17.16b, v19.16b, v17.16b
- ext v5.16b, v5.16b, v5.16b, #8
- add v19.4s, v25.4s, v17.4s
- eor v5.16b, v19.16b, v5.16b
- ext v21.16b, v21.16b, v21.16b, #4
- tbl v5.16b, { v5.16b }, v0.16b
- add v4.4s, v19.4s, v4.4s
- add v19.4s, v21.4s, v5.4s
- eor v17.16b, v17.16b, v19.16b
- ushr v21.4s, v17.4s, #12
- shl v17.4s, v17.4s, #20
- orr v17.16b, v17.16b, v21.16b
- add v4.4s, v4.4s, v17.4s
- eor v5.16b, v5.16b, v4.16b
- tbl v5.16b, { v5.16b }, v1.16b
- add v4.4s, v4.4s, v6.4s
- add v6.4s, v19.4s, v5.4s
- eor v17.16b, v17.16b, v6.16b
- ushr v19.4s, v17.4s, #7
- shl v17.4s, v17.4s, #25
- ext v4.16b, v4.16b, v4.16b, #4
- orr v17.16b, v17.16b, v19.16b
- ext v5.16b, v5.16b, v5.16b, #8
- add v4.4s, v4.4s, v17.4s
- eor v5.16b, v4.16b, v5.16b
- ext v6.16b, v6.16b, v6.16b, #12
- tbl v5.16b, { v5.16b }, v0.16b
- add v6.4s, v6.4s, v5.4s
- eor v17.16b, v17.16b, v6.16b
- ushr v19.4s, v17.4s, #12
- shl v17.4s, v17.4s, #20
- add v4.4s, v4.4s, v7.4s
- orr v17.16b, v17.16b, v19.16b
- add v4.4s, v4.4s, v17.4s
- eor v5.16b, v5.16b, v4.16b
- tbl v5.16b, { v5.16b }, v1.16b
- mov v29.16b, v20.16b
- ext v4.16b, v4.16b, v4.16b, #12
- add v6.4s, v6.4s, v5.4s
- mov v29.s[1], v27.s[2]
- add v4.4s, v4.4s, v27.4s
- zip1 v26.2d, v20.2d, v7.2d
- zip1 v7.4s, v16.4s, v27.4s
- zip1 v16.4s, v27.4s, v16.4s
- eor v17.16b, v17.16b, v6.16b
- ext v7.16b, v16.16b, v7.16b, #8
- ushr v16.4s, v17.4s, #7
- shl v17.4s, v17.4s, #25
- orr v16.16b, v17.16b, v16.16b
- ext v5.16b, v5.16b, v5.16b, #8
- add v4.4s, v4.4s, v16.4s
- eor v5.16b, v4.16b, v5.16b
- ext v6.16b, v6.16b, v6.16b, #4
- tbl v5.16b, { v5.16b }, v0.16b
- add v6.4s, v6.4s, v5.4s
- eor v16.16b, v16.16b, v6.16b
- ushr v17.4s, v16.4s, #12
- shl v16.4s, v16.4s, #20
- add v4.4s, v4.4s, v20.4s
- orr v16.16b, v16.16b, v17.16b
- add v4.4s, v4.4s, v16.4s
- eor v5.16b, v5.16b, v4.16b
- tbl v5.16b, { v5.16b }, v1.16b
- add v6.4s, v6.4s, v5.4s
- eor v16.16b, v16.16b, v6.16b
- add v4.4s, v4.4s, v18.4s
- ushr v17.4s, v16.4s, #7
- shl v16.4s, v16.4s, #25
- ext v23.16b, v22.16b, v22.16b, #12
- ext v4.16b, v4.16b, v4.16b, #4
- orr v16.16b, v16.16b, v17.16b
- ext v28.16b, v22.16b, v23.16b, #12
- ext v5.16b, v5.16b, v5.16b, #8
- add v4.4s, v16.4s, v4.4s
- tbl v3.16b, { v28.16b, v29.16b }, v3.16b
- eor v5.16b, v4.16b, v5.16b
- ext v6.16b, v6.16b, v6.16b, #12
- add v3.4s, v4.4s, v3.4s
- tbl v4.16b, { v5.16b }, v0.16b
- add v5.4s, v6.4s, v4.4s
- eor v6.16b, v16.16b, v5.16b
- ushr v16.4s, v6.4s, #12
- shl v6.4s, v6.4s, #20
- orr v6.16b, v6.16b, v16.16b
- tbl v2.16b, { v26.16b, v27.16b }, v2.16b
- add v3.4s, v3.4s, v6.4s
- ext v19.16b, v2.16b, v2.16b, #12
- eor v4.16b, v4.16b, v3.16b
- uzp1 v2.4s, v2.4s, v19.4s
- ext v3.16b, v3.16b, v3.16b, #12
- tbl v4.16b, { v4.16b }, v1.16b
- add v2.4s, v3.4s, v2.4s
- add v3.4s, v5.4s, v4.4s
- eor v5.16b, v6.16b, v3.16b
- ushr v6.4s, v5.4s, #7
- shl v5.4s, v5.4s, #25
- orr v5.16b, v5.16b, v6.16b
- ext v4.16b, v4.16b, v4.16b, #8
- add v2.4s, v2.4s, v5.4s
- eor v4.16b, v2.16b, v4.16b
- ext v3.16b, v3.16b, v3.16b, #4
- tbl v0.16b, { v4.16b }, v0.16b
- add v3.4s, v3.4s, v0.4s
- eor v4.16b, v5.16b, v3.16b
- ushr v5.4s, v4.4s, #12
- shl v4.4s, v4.4s, #20
- add v2.4s, v2.4s, v7.4s
- orr v4.16b, v4.16b, v5.16b
- add v2.4s, v2.4s, v4.4s
- eor v0.16b, v0.16b, v2.16b
- tbl v0.16b, { v0.16b }, v1.16b
- add v1.4s, v3.4s, v0.4s
- eor v3.16b, v4.16b, v1.16b
- ushr v4.4s, v3.4s, #7
- shl v3.4s, v3.4s, #25
- ext v2.16b, v2.16b, v2.16b, #4
- ext v0.16b, v0.16b, v0.16b, #8
- ext v1.16b, v1.16b, v1.16b, #12
- orr v3.16b, v3.16b, v4.16b
- eor v2.16b, v2.16b, v1.16b
- eor v3.16b, v3.16b, v0.16b
- stp q2, q3, [x5]
- ldr q2, [x0]
- eor v1.16b, v2.16b, v1.16b
- str q1, [x5, #32]
- ldr q1, [x0, #16]
- eor v0.16b, v1.16b, v0.16b
- str q0, [x5, #48]
- ret
-.Lfunc_end1:
- .size zfs_blake3_compress_xof_sse41, .Lfunc_end1-zfs_blake3_compress_xof_sse41
- .cfi_endproc
-
- .section .rodata.cst16,"aM",@progbits,16
- .p2align 4
-.LCPI2_0:
- .word 0
- .word 1
- .word 2
- .word 3
-.LCPI2_1:
- .byte 2
- .byte 3
- .byte 0
- .byte 1
- .byte 6
- .byte 7
- .byte 4
- .byte 5
- .byte 10
- .byte 11
- .byte 8
- .byte 9
- .byte 14
- .byte 15
- .byte 12
- .byte 13
-.LCPI2_2:
- .byte 1
- .byte 2
- .byte 3
- .byte 0
- .byte 5
- .byte 6
- .byte 7
- .byte 4
- .byte 9
- .byte 10
- .byte 11
- .byte 8
- .byte 13
- .byte 14
- .byte 15
- .byte 12
- .text
- .globl zfs_blake3_hash_many_sse41
- .p2align 2
- .type zfs_blake3_hash_many_sse41,@function
-zfs_blake3_hash_many_sse41:
- .cfi_startproc
- stp d15, d14, [sp, #-160]!
+ hint #34
+ stp d15, d14, [sp, #-144]!
stp d13, d12, [sp, #16]
stp d11, d10, [sp, #32]
stp d9, d8, [sp, #48]
- stp x29, x30, [sp, #64]
- stp x28, x27, [sp, #80]
- stp x26, x25, [sp, #96]
- stp x24, x23, [sp, #112]
- stp x22, x21, [sp, #128]
- stp x20, x19, [sp, #144]
- mov x29, sp
- sub sp, sp, #448
- .cfi_def_cfa w29, 160
+ stp x29, x27, [sp, #64]
+ stp x26, x25, [sp, #80]
+ stp x24, x23, [sp, #96]
+ stp x22, x21, [sp, #112]
+ stp x20, x19, [sp, #128]
+ sub sp, sp, #368
+ .cfi_def_cfa_offset 512
.cfi_offset w19, -8
.cfi_offset w20, -16
.cfi_offset w21, -24
.cfi_offset w25, -56
.cfi_offset w26, -64
.cfi_offset w27, -72
- .cfi_offset w28, -80
- .cfi_offset w30, -88
- .cfi_offset w29, -96
- .cfi_offset b8, -104
- .cfi_offset b9, -112
- .cfi_offset b10, -120
- .cfi_offset b11, -128
- .cfi_offset b12, -136
- .cfi_offset b13, -144
- .cfi_offset b14, -152
- .cfi_offset b15, -160
- ldr x26, [x29, #168]
- ldrb w27, [x29, #160]
- mov w19, w6
- mov x20, x4
- mov x22, x2
- mov x28, x1
+ .cfi_offset w29, -80
+ .cfi_offset b8, -88
+ .cfi_offset b9, -96
+ .cfi_offset b10, -104
+ .cfi_offset b11, -112
+ .cfi_offset b12, -120
+ .cfi_offset b13, -128
+ .cfi_offset b14, -136
+ .cfi_offset b15, -144
+ ldr x8, [sp, #520]
+ adrp x11, .LCPI3_1
+ ldrb w9, [sp, #512]
+ adrp x10, .LCPI3_2
cmp x1, #4
- mov x24, x0
- str x3, [sp, #40]
- b.lo .LBB2_8
- adrp x11, .LCPI2_0
- ldr q0, [x11, :lo12:.LCPI2_0]
+ b.lo .LBB3_6
+ adrp x12, .LCPI3_0
sbfx w13, w5, #0, #1
+ mov w15, #58983
+ mov w16, #44677
+ movk w15, #27145, lsl #16
+ movk w16, #47975, lsl #16
+ ldr q0, [x12, :lo12:.LCPI3_0]
dup v1.4s, w13
- mov w10, #58983
- mov w11, #44677
- mov w12, #62322
+ movi v13.4s, #64
+ mov w13, #62322
+ mov w14, #62778
+ orr w12, w7, w6
and v0.16b, v1.16b, v0.16b
- mov w13, #62778
- orr w8, w7, w19
- adrp x9, .LCPI2_1
- movk w10, #27145, lsl #16
- movk w11, #47975, lsl #16
- movk w12, #15470, lsl #16
- movk w13, #42319, lsl #16
- str q0, [sp, #16]
+ ldr q1, [x11, :lo12:.LCPI3_1]
+ movk w13, #15470, lsl #16
+ movk w14, #42319, lsl #16
+ dup v14.4s, w15
+ stp q0, q1, [sp, #16]
orr v0.4s, #128, lsl #24
- adrp x14, .LCPI2_2
str q0, [sp]
-.LBB2_2:
- ldr x2, [sp, #40]
- mov x15, x2
- ld1r { v7.4s }, [x15], #4
- add x16, x2, #8
- add x17, x2, #12
- add x18, x2, #16
- add x0, x2, #20
- add x3, x2, #24
- add x2, x2, #28
- ld1r { v6.4s }, [x16]
- ld1r { v17.4s }, [x17]
- ld1r { v10.4s }, [x18]
- ld1r { v11.4s }, [x0]
- ld1r { v19.4s }, [x3]
- ld1r { v18.4s }, [x15]
- ld1r { v16.4s }, [x2]
- cbz x22, .LBB2_7
+ dup v0.4s, w16
+ stp q0, q14, [sp, #48]
+ b .LBB3_3
+.LBB3_2:
+ zip1 v0.4s, v29.4s, v8.4s
+ add x15, x4, #4
+ zip1 v1.4s, v30.4s, v31.4s
+ tst w5, #0x1
+ zip1 v2.4s, v24.4s, v18.4s
+ csel x4, x15, x4, ne
+ zip1 v3.4s, v25.4s, v26.4s
+ add x0, x0, #32
+ zip2 v6.4s, v29.4s, v8.4s
+ sub x1, x1, #4
+ zip1 v4.2d, v0.2d, v1.2d
+ cmp x1, #3
+ zip2 v7.4s, v30.4s, v31.4s
+ zip1 v5.2d, v2.2d, v3.2d
+ zip2 v0.2d, v0.2d, v1.2d
+ zip2 v1.2d, v2.2d, v3.2d
+ zip2 v2.4s, v24.4s, v18.4s
+ zip2 v3.4s, v25.4s, v26.4s
+ stp q4, q5, [x8]
+ zip2 v4.2d, v6.2d, v7.2d
+ stp q0, q1, [x8, #32]
+ zip1 v0.2d, v6.2d, v7.2d
+ zip1 v1.2d, v2.2d, v3.2d
+ zip2 v2.2d, v2.2d, v3.2d
+ stp q0, q1, [x8, #64]
+ stp q4, q2, [x8, #96]
+ add x8, x8, #128
+ b.ls .LBB3_6
+.LBB3_3:
+ mov x15, x3
+ add x16, x3, #8
+ add x17, x3, #12
+ add x19, x3, #16
+ add x20, x3, #20
+ ld1r { v29.4s }, [x15], #4
+ ld1r { v30.4s }, [x16]
+ add x16, x3, #24
+ ld1r { v31.4s }, [x17]
+ add x17, x3, #28
+ ld1r { v24.4s }, [x19]
+ ld1r { v18.4s }, [x20]
+ ld1r { v25.4s }, [x16]
+ ld1r { v8.4s }, [x15]
+ ld1r { v26.4s }, [x17]
+ cbz x2, .LBB3_2
ldr q1, [sp, #16]
- dup v0.4s, w20
- ldp x15, x16, [x24]
- ldp x17, x18, [x24, #16]
+ dup v0.4s, w4
+ lsr x17, x4, #32
+ mov x15, xzr
+ ldp x19, x20, [x0, #16]
add v1.4s, v0.4s, v1.4s
+ mov x21, x2
movi v0.4s, #128, lsl #24
- str q1, [sp, #64]
+ mov w26, w12
+ str q1, [sp, #96]
eor v0.16b, v1.16b, v0.16b
ldr q1, [sp]
- lsr x2, x20, #32
- mov x0, xzr
- mov w6, w8
cmgt v0.4s, v1.4s, v0.4s
- dup v1.4s, w2
+ dup v1.4s, w17
+ ldp x16, x17, [x0]
sub v0.4s, v1.4s, v0.4s
- str q0, [sp, #48]
-.LBB2_4:
- mov w4, #16
- stp q16, q17, [sp, #192]
- bfi x4, x0, #6, #58
- ldr q1, [x15, x4]
- ldr q3, [x16, x4]
- ldr q2, [x17, x4]
- ldr q4, [x18, x4]
- mov w4, #32
- bfi x4, x0, #6, #58
- ldr q5, [x15, x4]
- ldr q20, [x16, x4]
- ldr q21, [x17, x4]
- ldr q22, [x18, x4]
- mov w4, #48
- lsl x3, x0, #6
- bfi x4, x0, #6, #58
- add x0, x0, #1
- ldr q0, [x15, x3]
- ldr q23, [x16, x3]
- ldr q16, [x17, x3]
- ldr q17, [x18, x3]
- cmp x0, x22
- ldr q25, [x15, x4]
- ldr q14, [x16, x4]
- ldr q28, [x17, x4]
- ldr q31, [x18, x4]
- csel w4, w27, wzr, eq
- orr w4, w4, w6
- mov x2, xzr
- and w6, w4, #0xff
- add x3, x3, #256
-.LBB2_5:
- ldr x4, [x24, x2]
- add x2, x2, #8
- cmp x2, #32
- add x4, x4, x3
- prfm pldl1keep, [x4]
- b.ne .LBB2_5
- zip1 v29.4s, v0.4s, v23.4s
- zip2 v23.4s, v0.4s, v23.4s
- zip1 v0.4s, v16.4s, v17.4s
- zip2 v24.4s, v16.4s, v17.4s
- zip1 v9.4s, v1.4s, v3.4s
- zip2 v26.4s, v1.4s, v3.4s
- zip1 v27.4s, v2.4s, v4.4s
- zip2 v17.4s, v2.4s, v4.4s
- zip1 v12.4s, v21.4s, v22.4s
- zip2 v13.4s, v21.4s, v22.4s
- add v2.4s, v7.4s, v10.4s
- add v1.4s, v18.4s, v11.4s
- ext v7.16b, v0.16b, v29.16b, #8
- ext v22.16b, v24.16b, v23.16b, #8
- zip1 v30.4s, v5.4s, v20.4s
- zip2 v20.4s, v5.4s, v20.4s
- stp q1, q2, [sp, #112]
- ext v2.16b, v29.16b, v7.16b, #8
- mov v29.d[1], v0.d[0]
- ext v18.16b, v23.16b, v22.16b, #8
- mov v23.d[1], v24.d[0]
- zip1 v21.4s, v25.4s, v14.4s
- zip2 v4.4s, v25.4s, v14.4s
- zip1 v14.4s, v28.4s, v31.4s
- zip2 v15.4s, v28.4s, v31.4s
- add v8.4s, v6.4s, v19.4s
- ext v28.16b, v27.16b, v9.16b, #8
- ext v31.16b, v17.16b, v26.16b, #8
- stur q2, [x29, #-208]
- mov v7.16b, v29.16b
- ext v0.16b, v12.16b, v30.16b, #8
- stp q23, q29, [x29, #-80]
- mov v2.16b, v19.16b
- ext v19.16b, v13.16b, v20.16b, #8
- mov v29.16b, v9.16b
- ext v25.16b, v9.16b, v28.16b, #8
- mov v29.d[1], v27.d[0]
- ext v24.16b, v26.16b, v31.16b, #8
- mov v26.d[1], v17.d[0]
- ext v17.16b, v15.16b, v4.16b, #8
- ext v27.16b, v30.16b, v0.16b, #8
- ext v0.16b, v20.16b, v19.16b, #8
- stp q0, q25, [sp, #80]
- ext v0.16b, v4.16b, v17.16b, #8
- str q0, [sp, #224]
- ldr q0, [sp, #128]
- mov v6.16b, v23.16b
- mov v22.16b, v4.16b
- ldr q16, [x9, :lo12:.LCPI2_1]
- add v17.4s, v0.4s, v7.4s
- ldr q0, [sp, #112]
- mov v30.d[1], v12.d[0]
- add v7.4s, v8.4s, v29.4s
- mov v20.d[1], v13.d[0]
- add v4.4s, v0.4s, v6.4s
- ldr q0, [sp, #64]
- dup v3.4s, w12
- ext v28.16b, v14.16b, v21.16b, #8
- dup v1.4s, w10
- eor v19.16b, v17.16b, v0.16b
- ldr q0, [sp, #48]
- ext v23.16b, v21.16b, v28.16b, #8
- mov v21.d[1], v14.d[0]
- tbl v14.16b, { v19.16b }, v16.16b
- eor v12.16b, v4.16b, v0.16b
- movi v0.4s, #64
- eor v13.16b, v7.16b, v0.16b
- tbl v13.16b, { v13.16b }, v16.16b
- add v6.4s, v13.4s, v3.4s
- dup v5.4s, w11
- tbl v12.16b, { v12.16b }, v16.16b
- add v1.4s, v14.4s, v1.4s
- eor v9.16b, v6.16b, v2.16b
- ldp q2, q0, [sp, #192]
- add v5.4s, v12.4s, v5.4s
- eor v19.16b, v1.16b, v10.16b
- eor v10.16b, v5.16b, v11.16b
- ushr v11.4s, v19.4s, #12
- shl v19.4s, v19.4s, #20
- orr v11.16b, v19.16b, v11.16b
- ushr v19.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- mov v22.d[1], v15.d[0]
- orr v10.16b, v10.16b, v19.16b
- ushr v19.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- add v15.4s, v0.4s, v2.4s
- orr v9.16b, v9.16b, v19.16b
- dup v19.4s, w6
- add v15.4s, v15.4s, v26.4s
- eor v19.16b, v15.16b, v19.16b
- tbl v3.16b, { v19.16b }, v16.16b
- dup v19.4s, w13
- add v8.4s, v3.4s, v19.4s
- ldur q31, [x29, #-208]
- eor v19.16b, v8.16b, v2.16b
- ushr v0.4s, v19.4s, #12
- shl v19.4s, v19.4s, #20
- orr v2.16b, v19.16b, v0.16b
- ldr q19, [x14, :lo12:.LCPI2_2]
- add v17.4s, v17.4s, v31.4s
- add v17.4s, v17.4s, v11.4s
- eor v14.16b, v14.16b, v17.16b
- tbl v14.16b, { v14.16b }, v19.16b
- add v1.4s, v1.4s, v14.4s
- eor v11.16b, v1.16b, v11.16b
- add v4.4s, v4.4s, v18.4s
- ushr v0.4s, v11.4s, #7
- shl v11.4s, v11.4s, #25
- add v4.4s, v4.4s, v10.4s
- orr v0.16b, v11.16b, v0.16b
- eor v11.16b, v12.16b, v4.16b
- tbl v11.16b, { v11.16b }, v19.16b
- add v5.4s, v5.4s, v11.4s
- eor v10.16b, v5.16b, v10.16b
- add v7.4s, v7.4s, v25.4s
- ushr v12.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- add v7.4s, v7.4s, v9.4s
- orr v10.16b, v10.16b, v12.16b
- eor v12.16b, v13.16b, v7.16b
- tbl v12.16b, { v12.16b }, v19.16b
- add v6.4s, v6.4s, v12.4s
- eor v9.16b, v6.16b, v9.16b
- ushr v13.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- orr v9.16b, v9.16b, v13.16b
- add v13.4s, v15.4s, v24.4s
- add v13.4s, v13.4s, v2.4s
- eor v3.16b, v3.16b, v13.16b
- tbl v3.16b, { v3.16b }, v19.16b
- add v8.4s, v8.4s, v3.4s
- eor v2.16b, v8.16b, v2.16b
- add v17.4s, v17.4s, v30.4s
- ushr v15.4s, v2.4s, #7
- shl v2.4s, v2.4s, #25
- add v17.4s, v17.4s, v10.4s
- add v4.4s, v4.4s, v20.4s
- orr v2.16b, v2.16b, v15.16b
- eor v3.16b, v3.16b, v17.16b
- add v4.4s, v4.4s, v9.4s
- add v7.4s, v7.4s, v21.4s
- tbl v3.16b, { v3.16b }, v16.16b
- eor v14.16b, v14.16b, v4.16b
- add v7.4s, v7.4s, v2.4s
- add v13.4s, v13.4s, v22.4s
- mov v28.16b, v26.16b
- stur q26, [x29, #-112]
- mov v26.16b, v18.16b
- mov v18.16b, v24.16b
- stur q24, [x29, #-160]
- add v6.4s, v6.4s, v3.4s
- mov v24.16b, v20.16b
- tbl v14.16b, { v14.16b }, v16.16b
- eor v11.16b, v11.16b, v7.16b
- add v13.4s, v13.4s, v0.4s
- ldr q20, [sp, #80]
- eor v10.16b, v6.16b, v10.16b
- add v8.4s, v8.4s, v14.4s
- tbl v11.16b, { v11.16b }, v16.16b
- eor v12.16b, v12.16b, v13.16b
- stp q30, q22, [x29, #-192]
- ushr v15.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- eor v9.16b, v8.16b, v9.16b
- add v1.4s, v1.4s, v11.4s
- tbl v12.16b, { v12.16b }, v16.16b
- mov v30.16b, v27.16b
- add v17.4s, v17.4s, v27.4s
- ldr q27, [sp, #224]
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v2.16b, v1.16b, v2.16b
- add v5.4s, v5.4s, v12.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #12
- shl v2.4s, v2.4s, #20
- eor v0.16b, v5.16b, v0.16b
- add v17.4s, v17.4s, v10.4s
- add v4.4s, v4.4s, v20.4s
- orr v2.16b, v2.16b, v15.16b
- ushr v15.4s, v0.4s, #12
- shl v0.4s, v0.4s, #20
- eor v3.16b, v3.16b, v17.16b
- add v4.4s, v4.4s, v9.4s
- add v7.4s, v7.4s, v23.4s
- orr v0.16b, v0.16b, v15.16b
- tbl v3.16b, { v3.16b }, v19.16b
- eor v14.16b, v14.16b, v4.16b
- add v7.4s, v7.4s, v2.4s
- add v13.4s, v13.4s, v27.4s
- add v6.4s, v6.4s, v3.4s
- tbl v14.16b, { v14.16b }, v19.16b
- eor v11.16b, v11.16b, v7.16b
- add v13.4s, v13.4s, v0.4s
- eor v10.16b, v6.16b, v10.16b
- add v8.4s, v8.4s, v14.4s
- tbl v11.16b, { v11.16b }, v19.16b
- eor v12.16b, v12.16b, v13.16b
- stur q21, [x29, #-144]
- ushr v15.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- eor v9.16b, v8.16b, v9.16b
- add v1.4s, v1.4s, v11.4s
- tbl v12.16b, { v12.16b }, v19.16b
- ldur q21, [x29, #-80]
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v2.16b, v1.16b, v2.16b
- add v5.4s, v5.4s, v12.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #7
- shl v2.4s, v2.4s, #25
+ str q0, [sp, #80]
+.LBB3_5:
+ add x23, x16, x15
+ add x24, x17, x15
+ add x22, x19, x15
+ add x25, x20, x15
+ subs x21, x21, #1
+ add x15, x15, #64
+ ldp q1, q2, [x23]
+ csel w27, w9, wzr, eq
+ orr w26, w27, w26
+ and w26, w26, #0xff
+ ldp q4, q5, [x24]
+ dup v0.4s, w26
+ mov w26, w6
+ zip1 v22.4s, v1.4s, v4.4s
+ zip2 v20.4s, v1.4s, v4.4s
+ ldp q6, q7, [x22]
+ zip1 v17.4s, v2.4s, v5.4s
+ zip2 v23.4s, v2.4s, v5.4s
+ ldp q16, q21, [x25]
+ zip1 v19.4s, v6.4s, v16.4s
+ zip2 v1.4s, v6.4s, v16.4s
+ ldp q27, q28, [x23, #32]
+ zip1 v4.4s, v7.4s, v21.4s
+ zip2 v5.4s, v7.4s, v21.4s
+ zip2 v15.2d, v17.2d, v4.2d
+ ldp q9, q10, [x24, #32]
+ mov v17.d[1], v4.d[0]
+ add v4.4s, v30.4s, v25.4s
+ zip2 v11.2d, v23.2d, v5.2d
+ zip2 v3.4s, v27.4s, v9.4s
+ zip1 v7.4s, v27.4s, v9.4s
+ ldp q12, q6, [x22, #32]
+ mov v23.d[1], v5.d[0]
+ stp q11, q3, [sp, #256]
+ add v5.4s, v31.4s, v26.4s
+ add v4.4s, v4.4s, v17.4s
+ str q23, [sp, #352]
+ ldp q16, q2, [x25, #32]
+ add v5.4s, v5.4s, v23.4s
+ zip1 v3.4s, v12.4s, v16.4s
eor v0.16b, v5.16b, v0.16b
- orr v2.16b, v2.16b, v15.16b
- ushr v15.4s, v0.4s, #7
- shl v0.4s, v0.4s, #25
- orr v0.16b, v0.16b, v15.16b
- add v17.4s, v17.4s, v21.4s
- add v17.4s, v17.4s, v0.4s
- add v4.4s, v4.4s, v26.4s
- eor v14.16b, v14.16b, v17.16b
- add v4.4s, v4.4s, v10.4s
- add v7.4s, v7.4s, v18.4s
- tbl v14.16b, { v14.16b }, v16.16b
- eor v11.16b, v11.16b, v4.16b
- add v7.4s, v7.4s, v9.4s
- add v13.4s, v13.4s, v29.4s
- add v1.4s, v1.4s, v14.4s
- tbl v11.16b, { v11.16b }, v16.16b
- eor v12.16b, v12.16b, v7.16b
- add v13.4s, v13.4s, v2.4s
- eor v0.16b, v0.16b, v1.16b
+ zip1 v9.4s, v6.4s, v2.4s
+ zip2 v2.4s, v6.4s, v2.4s
+ stp q7, q3, [sp, #208]
+ zip2 v3.4s, v12.4s, v16.4s
+ zip1 v12.4s, v28.4s, v10.4s
+ zip2 v10.4s, v28.4s, v10.4s
+ stp q17, q2, [sp, #160]
+ zip2 v28.2d, v22.2d, v19.2d
+ mov v22.d[1], v19.d[0]
+ str q3, [sp, #240]
+ add v2.4s, v8.4s, v18.4s
+ eor v16.16b, v4.16b, v13.16b
+ dup v17.4s, w13
+ mov v3.16b, v22.16b
+ stp q22, q28, [sp, #320]
+ zip2 v22.2d, v20.2d, v1.2d
+ mov v20.d[1], v1.d[0]
+ add v1.4s, v29.4s, v24.4s
+ add v4.4s, v4.4s, v15.4s
add v5.4s, v5.4s, v11.4s
- tbl v12.16b, { v12.16b }, v16.16b
- eor v3.16b, v3.16b, v13.16b
- ldur q22, [x29, #-64]
- ushr v15.4s, v0.4s, #12
+ add v2.4s, v2.4s, v20.4s
+ stp q15, q20, [sp, #288]
+ add v1.4s, v1.4s, v3.4s
+ ldr q3, [sp, #96]
+ dup v20.4s, w14
+ mov v23.16b, v22.16b
+ mov v15.16b, v10.16b
+ eor v6.16b, v1.16b, v3.16b
+ ldr q3, [sp, #80]
+ add v1.4s, v1.4s, v28.4s
+ ldr q28, [sp, #272]
+ str q23, [sp, #128]
+ eor v7.16b, v2.16b, v3.16b
+ ldp q27, q3, [sp, #32]
+ add v2.4s, v2.4s, v22.4s
+ tbl v6.16b, { v6.16b }, v27.16b
+ tbl v7.16b, { v7.16b }, v27.16b
+ tbl v16.16b, { v16.16b }, v27.16b
+ tbl v0.16b, { v0.16b }, v27.16b
+ add v19.4s, v6.4s, v14.4s
+ add v21.4s, v7.4s, v3.4s
+ add v30.4s, v16.4s, v17.4s
+ add v31.4s, v0.4s, v20.4s
+ eor v24.16b, v19.16b, v24.16b
+ eor v17.16b, v21.16b, v18.16b
+ ushr v18.4s, v24.4s, #12
+ shl v20.4s, v24.4s, #20
+ eor v24.16b, v30.16b, v25.16b
+ eor v25.16b, v31.16b, v26.16b
+ ushr v26.4s, v17.4s, #12
+ shl v17.4s, v17.4s, #20
+ ushr v29.4s, v24.4s, #12
+ shl v24.4s, v24.4s, #20
+ ushr v8.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ orr v3.16b, v20.16b, v18.16b
+ ldr q18, [x10, :lo12:.LCPI3_2]
+ orr v13.16b, v17.16b, v26.16b
+ orr v24.16b, v24.16b, v29.16b
+ orr v14.16b, v25.16b, v8.16b
+ add v8.4s, v1.4s, v3.4s
+ add v29.4s, v2.4s, v13.4s
+ add v17.4s, v4.4s, v24.4s
+ add v20.4s, v5.4s, v14.4s
+ eor v1.16b, v6.16b, v8.16b
+ eor v2.16b, v7.16b, v29.16b
+ eor v4.16b, v16.16b, v17.16b
+ eor v0.16b, v0.16b, v20.16b
+ tbl v25.16b, { v1.16b }, v18.16b
+ tbl v16.16b, { v2.16b }, v18.16b
+ tbl v6.16b, { v4.16b }, v18.16b
+ tbl v4.16b, { v0.16b }, v18.16b
+ add v19.4s, v19.4s, v25.4s
+ add v21.4s, v21.4s, v16.4s
+ add v26.4s, v30.4s, v6.4s
+ add v7.4s, v31.4s, v4.4s
+ eor v0.16b, v19.16b, v3.16b
+ eor v1.16b, v21.16b, v13.16b
+ eor v2.16b, v26.16b, v24.16b
+ eor v3.16b, v7.16b, v14.16b
+ ushr v5.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ ushr v24.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ ushr v30.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v5.16b, v0.16b, v5.16b
+ orr v0.16b, v1.16b, v24.16b
+ ushr v31.4s, v3.4s, #7
+ orr v2.16b, v2.16b, v30.16b
+ ldp q24, q30, [sp, #208]
+ shl v3.4s, v3.4s, #25
+ zip2 v14.2d, v12.2d, v9.2d
+ mov v22.16b, v24.16b
+ orr v1.16b, v3.16b, v31.16b
+ zip2 v3.2d, v24.2d, v30.2d
+ mov v24.16b, v28.16b
+ mov v22.d[1], v30.d[0]
+ ldr q30, [sp, #240]
+ mov v31.16b, v12.16b
+ stp q22, q14, [sp, #224]
+ mov v24.d[1], v30.d[0]
+ add v12.4s, v8.4s, v22.4s
+ mov v31.d[1], v9.d[0]
+ add v22.4s, v29.4s, v24.4s
+ ldr q29, [sp, #176]
+ zip2 v28.2d, v28.2d, v30.2d
+ mov v9.16b, v24.16b
+ mov v15.d[1], v29.d[0]
+ zip2 v8.2d, v10.2d, v29.2d
+ add v10.4s, v12.4s, v0.4s
+ add v22.4s, v22.4s, v2.4s
+ str q9, [sp, #144]
+ add v20.4s, v20.4s, v15.4s
+ add v17.4s, v17.4s, v31.4s
+ stp q3, q8, [sp, #192]
+ eor v4.16b, v4.16b, v10.16b
+ eor v25.16b, v25.16b, v22.16b
+ add v20.4s, v20.4s, v5.4s
+ add v17.4s, v17.4s, v1.4s
+ tbl v4.16b, { v4.16b }, v27.16b
+ tbl v25.16b, { v25.16b }, v27.16b
+ eor v6.16b, v6.16b, v20.16b
+ eor v16.16b, v16.16b, v17.16b
+ add v26.4s, v26.4s, v4.4s
+ add v7.4s, v7.4s, v25.4s
+ tbl v6.16b, { v6.16b }, v27.16b
+ tbl v16.16b, { v16.16b }, v27.16b
+ eor v0.16b, v26.16b, v0.16b
+ eor v2.16b, v7.16b, v2.16b
+ add v21.4s, v21.4s, v6.4s
+ add v19.4s, v19.4s, v16.4s
+ ushr v12.4s, v0.4s, #12
shl v0.4s, v0.4s, #20
- eor v10.16b, v5.16b, v10.16b
- add v6.4s, v6.4s, v12.4s
- tbl v3.16b, { v3.16b }, v16.16b
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- eor v9.16b, v6.16b, v9.16b
- add v8.4s, v8.4s, v3.4s
- add v17.4s, v17.4s, v28.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v2.16b, v8.16b, v2.16b
- add v17.4s, v17.4s, v0.4s
- add v4.4s, v4.4s, v24.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #12
+ ushr v13.4s, v2.4s, #12
shl v2.4s, v2.4s, #20
- eor v14.16b, v14.16b, v17.16b
- add v4.4s, v4.4s, v10.4s
- add v7.4s, v7.4s, v22.4s
- orr v2.16b, v2.16b, v15.16b
- tbl v14.16b, { v14.16b }, v19.16b
- eor v11.16b, v11.16b, v4.16b
- add v7.4s, v7.4s, v9.4s
- add v13.4s, v13.4s, v23.4s
- add v1.4s, v1.4s, v14.4s
- tbl v11.16b, { v11.16b }, v19.16b
- eor v12.16b, v12.16b, v7.16b
- add v13.4s, v13.4s, v2.4s
- eor v0.16b, v0.16b, v1.16b
- add v5.4s, v5.4s, v11.4s
- tbl v12.16b, { v12.16b }, v19.16b
- eor v3.16b, v3.16b, v13.16b
- ldur q22, [x29, #-144]
- ushr v15.4s, v0.4s, #7
+ eor v5.16b, v21.16b, v5.16b
+ eor v1.16b, v19.16b, v1.16b
+ orr v0.16b, v0.16b, v12.16b
+ add v10.4s, v10.4s, v3.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v13.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ add v22.4s, v22.4s, v28.4s
+ ushr v12.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v10.4s, v10.4s, v0.4s
+ orr v5.16b, v5.16b, v13.16b
+ add v22.4s, v22.4s, v2.4s
+ add v20.4s, v20.4s, v8.4s
+ orr v1.16b, v1.16b, v12.16b
+ add v17.4s, v17.4s, v14.4s
+ eor v4.16b, v4.16b, v10.16b
+ eor v25.16b, v25.16b, v22.16b
+ add v20.4s, v20.4s, v5.4s
+ add v17.4s, v17.4s, v1.4s
+ tbl v4.16b, { v4.16b }, v18.16b
+ tbl v25.16b, { v25.16b }, v18.16b
+ eor v6.16b, v6.16b, v20.16b
+ eor v16.16b, v16.16b, v17.16b
+ add v26.4s, v26.4s, v4.4s
+ add v7.4s, v7.4s, v25.4s
+ tbl v6.16b, { v6.16b }, v18.16b
+ tbl v16.16b, { v16.16b }, v18.16b
+ eor v0.16b, v26.16b, v0.16b
+ eor v2.16b, v7.16b, v2.16b
+ add v21.4s, v21.4s, v6.4s
+ add v19.4s, v19.4s, v16.4s
+ ushr v12.4s, v0.4s, #7
shl v0.4s, v0.4s, #25
- eor v10.16b, v5.16b, v10.16b
- add v6.4s, v6.4s, v12.4s
- tbl v3.16b, { v3.16b }, v19.16b
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- eor v9.16b, v6.16b, v9.16b
- add v8.4s, v8.4s, v3.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v2.16b, v8.16b, v2.16b
- add v17.4s, v17.4s, v31.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #7
+ ushr v13.4s, v2.4s, #7
shl v2.4s, v2.4s, #25
- add v17.4s, v17.4s, v10.4s
- add v4.4s, v4.4s, v22.4s
- orr v2.16b, v2.16b, v15.16b
- eor v3.16b, v3.16b, v17.16b
- add v4.4s, v4.4s, v9.4s
- add v7.4s, v7.4s, v30.4s
- tbl v3.16b, { v3.16b }, v16.16b
- eor v14.16b, v14.16b, v4.16b
- add v7.4s, v7.4s, v2.4s
- add v13.4s, v13.4s, v27.4s
- add v6.4s, v6.4s, v3.4s
- tbl v14.16b, { v14.16b }, v16.16b
- eor v11.16b, v11.16b, v7.16b
- add v13.4s, v13.4s, v0.4s
- ldr q27, [sp, #96]
- mov v21.16b, v26.16b
- stur q26, [x29, #-96]
- mov v28.16b, v31.16b
- eor v10.16b, v6.16b, v10.16b
- add v8.4s, v8.4s, v14.4s
- tbl v11.16b, { v11.16b }, v16.16b
- eor v12.16b, v12.16b, v13.16b
- ldp q31, q26, [x29, #-192]
- ushr v15.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- eor v9.16b, v8.16b, v9.16b
- add v1.4s, v1.4s, v11.4s
- tbl v12.16b, { v12.16b }, v16.16b
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v2.16b, v1.16b, v2.16b
- add v5.4s, v5.4s, v12.4s
- add v17.4s, v17.4s, v20.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #12
- shl v2.4s, v2.4s, #20
- eor v0.16b, v5.16b, v0.16b
- add v17.4s, v17.4s, v10.4s
- add v4.4s, v4.4s, v27.4s
- orr v2.16b, v2.16b, v15.16b
- ushr v15.4s, v0.4s, #12
+ eor v5.16b, v21.16b, v5.16b
+ eor v1.16b, v19.16b, v1.16b
+ orr v0.16b, v0.16b, v12.16b
+ add v22.4s, v22.4s, v23.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v13.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ add v17.4s, v17.4s, v11.4s
+ mov v30.16b, v28.16b
+ mov v28.16b, v23.16b
+ ldr q23, [sp, #304]
+ ushr v12.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v22.4s, v22.4s, v0.4s
+ mov v29.16b, v31.16b
+ ldr q31, [sp, #160]
+ orr v5.16b, v5.16b, v13.16b
+ add v17.4s, v17.4s, v2.4s
+ add v10.4s, v10.4s, v23.4s
+ orr v1.16b, v1.16b, v12.16b
+ str q29, [sp, #272]
+ eor v16.16b, v16.16b, v22.16b
+ add v20.4s, v20.4s, v31.4s
+ eor v6.16b, v6.16b, v17.16b
+ add v10.4s, v10.4s, v5.4s
+ tbl v16.16b, { v16.16b }, v27.16b
+ add v20.4s, v20.4s, v1.4s
+ tbl v6.16b, { v6.16b }, v27.16b
+ eor v25.16b, v25.16b, v10.16b
+ add v21.4s, v21.4s, v16.4s
+ eor v4.16b, v4.16b, v20.16b
+ add v26.4s, v26.4s, v6.4s
+ tbl v25.16b, { v25.16b }, v27.16b
+ eor v0.16b, v21.16b, v0.16b
+ tbl v4.16b, { v4.16b }, v27.16b
+ eor v2.16b, v26.16b, v2.16b
+ add v19.4s, v19.4s, v25.4s
+ ushr v12.4s, v0.4s, #12
shl v0.4s, v0.4s, #20
- eor v3.16b, v3.16b, v17.16b
- add v4.4s, v4.4s, v9.4s
- add v7.4s, v7.4s, v26.4s
- orr v0.16b, v0.16b, v15.16b
- tbl v3.16b, { v3.16b }, v19.16b
- eor v14.16b, v14.16b, v4.16b
- add v7.4s, v7.4s, v2.4s
- add v13.4s, v13.4s, v31.4s
- add v6.4s, v6.4s, v3.4s
- tbl v14.16b, { v14.16b }, v19.16b
- eor v11.16b, v11.16b, v7.16b
- add v13.4s, v13.4s, v0.4s
- eor v10.16b, v6.16b, v10.16b
- add v8.4s, v8.4s, v14.4s
- tbl v11.16b, { v11.16b }, v19.16b
- eor v12.16b, v12.16b, v13.16b
- ushr v15.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- eor v9.16b, v8.16b, v9.16b
- add v1.4s, v1.4s, v11.4s
- tbl v12.16b, { v12.16b }, v19.16b
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v2.16b, v1.16b, v2.16b
- add v5.4s, v5.4s, v12.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #7
- shl v2.4s, v2.4s, #25
- eor v0.16b, v5.16b, v0.16b
- mov v18.16b, v24.16b
- mov v24.16b, v20.16b
- orr v2.16b, v2.16b, v15.16b
- ushr v15.4s, v0.4s, #7
+ add v7.4s, v7.4s, v4.4s
+ ushr v13.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v5.16b, v5.16b, v19.16b
+ add v22.4s, v22.4s, v24.4s
+ ldr q24, [sp, #320]
+ orr v0.16b, v0.16b, v12.16b
+ eor v1.16b, v7.16b, v1.16b
+ orr v2.16b, v2.16b, v13.16b
+ ushr v12.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ add v17.4s, v17.4s, v24.4s
+ ldr q24, [sp, #352]
+ ushr v13.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v22.4s, v22.4s, v0.4s
+ orr v5.16b, v5.16b, v12.16b
+ add v17.4s, v17.4s, v2.4s
+ add v10.4s, v10.4s, v24.4s
+ ldr q24, [sp, #336]
+ orr v1.16b, v1.16b, v13.16b
+ eor v16.16b, v16.16b, v22.16b
+ add v20.4s, v20.4s, v14.4s
+ eor v6.16b, v6.16b, v17.16b
+ add v10.4s, v10.4s, v5.4s
+ tbl v16.16b, { v16.16b }, v18.16b
+ add v20.4s, v20.4s, v1.4s
+ tbl v6.16b, { v6.16b }, v18.16b
+ eor v25.16b, v25.16b, v10.16b
+ add v21.4s, v21.4s, v16.4s
+ eor v4.16b, v4.16b, v20.16b
+ add v26.4s, v26.4s, v6.4s
+ tbl v25.16b, { v25.16b }, v18.16b
+ eor v0.16b, v21.16b, v0.16b
+ tbl v4.16b, { v4.16b }, v18.16b
+ eor v2.16b, v26.16b, v2.16b
+ add v19.4s, v19.4s, v25.4s
+ ushr v12.4s, v0.4s, #7
shl v0.4s, v0.4s, #25
- ldur q20, [x29, #-160]
- orr v0.16b, v0.16b, v15.16b
- add v17.4s, v17.4s, v21.4s
- add v17.4s, v17.4s, v0.4s
- add v4.4s, v4.4s, v18.4s
- eor v14.16b, v14.16b, v17.16b
- add v4.4s, v4.4s, v10.4s
- add v7.4s, v7.4s, v23.4s
- tbl v14.16b, { v14.16b }, v16.16b
- eor v11.16b, v11.16b, v4.16b
- add v7.4s, v7.4s, v9.4s
- add v13.4s, v13.4s, v20.4s
- add v1.4s, v1.4s, v14.4s
- tbl v11.16b, { v11.16b }, v16.16b
- eor v12.16b, v12.16b, v7.16b
- add v13.4s, v13.4s, v2.4s
- eor v0.16b, v0.16b, v1.16b
- add v5.4s, v5.4s, v11.4s
- tbl v12.16b, { v12.16b }, v16.16b
- eor v3.16b, v3.16b, v13.16b
- ldur q25, [x29, #-80]
- ushr v15.4s, v0.4s, #12
+ add v7.4s, v7.4s, v4.4s
+ ushr v13.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ eor v5.16b, v19.16b, v5.16b
+ orr v0.16b, v0.16b, v12.16b
+ eor v1.16b, v7.16b, v1.16b
+ add v10.4s, v10.4s, v24.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v12.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ add v22.4s, v22.4s, v29.4s
+ ushr v13.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v10.4s, v10.4s, v0.4s
+ orr v5.16b, v5.16b, v12.16b
+ add v22.4s, v22.4s, v2.4s
+ add v20.4s, v20.4s, v8.4s
+ ldr q8, [sp, #288]
+ orr v1.16b, v1.16b, v13.16b
+ add v17.4s, v17.4s, v3.4s
+ ldr q3, [sp, #352]
+ eor v4.16b, v4.16b, v10.16b
+ eor v25.16b, v25.16b, v22.16b
+ add v20.4s, v20.4s, v5.4s
+ add v17.4s, v17.4s, v1.4s
+ tbl v4.16b, { v4.16b }, v27.16b
+ tbl v25.16b, { v25.16b }, v27.16b
+ eor v6.16b, v6.16b, v20.16b
+ eor v16.16b, v16.16b, v17.16b
+ add v26.4s, v26.4s, v4.4s
+ add v7.4s, v7.4s, v25.4s
+ tbl v6.16b, { v6.16b }, v27.16b
+ tbl v16.16b, { v16.16b }, v27.16b
+ eor v0.16b, v26.16b, v0.16b
+ eor v2.16b, v7.16b, v2.16b
+ add v21.4s, v21.4s, v6.4s
+ add v19.4s, v19.4s, v16.4s
+ ushr v12.4s, v0.4s, #12
shl v0.4s, v0.4s, #20
- eor v10.16b, v5.16b, v10.16b
- add v6.4s, v6.4s, v12.4s
- tbl v3.16b, { v3.16b }, v16.16b
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- eor v9.16b, v6.16b, v9.16b
- add v8.4s, v8.4s, v3.4s
- add v17.4s, v17.4s, v29.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v2.16b, v8.16b, v2.16b
- add v17.4s, v17.4s, v0.4s
- add v4.4s, v4.4s, v22.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #12
+ ushr v13.4s, v2.4s, #12
shl v2.4s, v2.4s, #20
- eor v14.16b, v14.16b, v17.16b
- add v4.4s, v4.4s, v10.4s
+ eor v5.16b, v21.16b, v5.16b
+ eor v1.16b, v19.16b, v1.16b
+ orr v0.16b, v0.16b, v12.16b
+ add v10.4s, v10.4s, v30.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v13.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ add v22.4s, v22.4s, v8.4s
+ mov v24.16b, v30.16b
+ mov v30.16b, v15.16b
+ add v17.4s, v17.4s, v15.4s
+ ldr q15, [sp, #224]
+ ushr v12.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v10.4s, v10.4s, v0.4s
+ str q30, [sp, #176]
+ orr v5.16b, v5.16b, v13.16b
+ add v22.4s, v22.4s, v2.4s
+ add v20.4s, v20.4s, v15.4s
+ orr v1.16b, v1.16b, v12.16b
+ eor v4.16b, v4.16b, v10.16b
+ eor v25.16b, v25.16b, v22.16b
+ add v20.4s, v20.4s, v5.4s
+ add v17.4s, v17.4s, v1.4s
+ tbl v4.16b, { v4.16b }, v18.16b
+ tbl v25.16b, { v25.16b }, v18.16b
+ eor v6.16b, v6.16b, v20.16b
+ eor v16.16b, v16.16b, v17.16b
+ add v26.4s, v26.4s, v4.4s
add v7.4s, v7.4s, v25.4s
- orr v2.16b, v2.16b, v15.16b
- tbl v14.16b, { v14.16b }, v19.16b
- eor v11.16b, v11.16b, v4.16b
- add v7.4s, v7.4s, v9.4s
- add v13.4s, v13.4s, v26.4s
- add v1.4s, v1.4s, v14.4s
- tbl v11.16b, { v11.16b }, v19.16b
- eor v12.16b, v12.16b, v7.16b
- add v13.4s, v13.4s, v2.4s
- ldur q25, [x29, #-112]
- eor v0.16b, v0.16b, v1.16b
- add v5.4s, v5.4s, v11.4s
- tbl v12.16b, { v12.16b }, v19.16b
- eor v3.16b, v3.16b, v13.16b
- ushr v15.4s, v0.4s, #7
+ tbl v6.16b, { v6.16b }, v18.16b
+ tbl v16.16b, { v16.16b }, v18.16b
+ eor v0.16b, v26.16b, v0.16b
+ eor v2.16b, v7.16b, v2.16b
+ add v21.4s, v21.4s, v6.4s
+ add v19.4s, v19.4s, v16.4s
+ ushr v12.4s, v0.4s, #7
shl v0.4s, v0.4s, #25
- eor v10.16b, v5.16b, v10.16b
- add v6.4s, v6.4s, v12.4s
- tbl v3.16b, { v3.16b }, v19.16b
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- eor v9.16b, v6.16b, v9.16b
- add v8.4s, v8.4s, v3.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v2.16b, v8.16b, v2.16b
- add v17.4s, v17.4s, v25.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #7
+ ushr v13.4s, v2.4s, #7
shl v2.4s, v2.4s, #25
- add v17.4s, v17.4s, v10.4s
- add v4.4s, v4.4s, v30.4s
- orr v2.16b, v2.16b, v15.16b
- eor v3.16b, v3.16b, v17.16b
- add v4.4s, v4.4s, v9.4s
- add v7.4s, v7.4s, v24.4s
- tbl v3.16b, { v3.16b }, v16.16b
- eor v14.16b, v14.16b, v4.16b
- add v7.4s, v7.4s, v2.4s
- add v13.4s, v13.4s, v31.4s
- add v6.4s, v6.4s, v3.4s
- tbl v14.16b, { v14.16b }, v16.16b
- eor v11.16b, v11.16b, v7.16b
- add v13.4s, v13.4s, v0.4s
- ldur q25, [x29, #-64]
- eor v10.16b, v6.16b, v10.16b
- add v8.4s, v8.4s, v14.4s
- tbl v11.16b, { v11.16b }, v16.16b
- eor v12.16b, v12.16b, v13.16b
- ldr q31, [sp, #224]
- ushr v15.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- eor v9.16b, v8.16b, v9.16b
- add v1.4s, v1.4s, v11.4s
- tbl v12.16b, { v12.16b }, v16.16b
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v2.16b, v1.16b, v2.16b
- add v5.4s, v5.4s, v12.4s
- add v17.4s, v17.4s, v27.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #12
- shl v2.4s, v2.4s, #20
- eor v0.16b, v5.16b, v0.16b
- add v17.4s, v17.4s, v10.4s
- add v4.4s, v4.4s, v25.4s
- orr v2.16b, v2.16b, v15.16b
- ushr v15.4s, v0.4s, #12
+ eor v5.16b, v21.16b, v5.16b
+ eor v1.16b, v19.16b, v1.16b
+ orr v0.16b, v0.16b, v12.16b
+ add v22.4s, v22.4s, v9.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v13.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ add v17.4s, v17.4s, v14.4s
+ ushr v12.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v22.4s, v22.4s, v0.4s
+ orr v5.16b, v5.16b, v13.16b
+ add v17.4s, v17.4s, v2.4s
+ add v10.4s, v10.4s, v28.4s
+ orr v1.16b, v1.16b, v12.16b
+ eor v16.16b, v16.16b, v22.16b
+ add v20.4s, v20.4s, v11.4s
+ eor v6.16b, v6.16b, v17.16b
+ add v10.4s, v10.4s, v5.4s
+ tbl v16.16b, { v16.16b }, v27.16b
+ add v20.4s, v20.4s, v1.4s
+ tbl v6.16b, { v6.16b }, v27.16b
+ eor v25.16b, v25.16b, v10.16b
+ add v21.4s, v21.4s, v16.4s
+ eor v4.16b, v4.16b, v20.16b
+ add v26.4s, v26.4s, v6.4s
+ tbl v25.16b, { v25.16b }, v27.16b
+ eor v0.16b, v21.16b, v0.16b
+ tbl v4.16b, { v4.16b }, v27.16b
+ eor v2.16b, v26.16b, v2.16b
+ add v19.4s, v19.4s, v25.4s
+ ushr v12.4s, v0.4s, #12
shl v0.4s, v0.4s, #20
- eor v3.16b, v3.16b, v17.16b
- add v4.4s, v4.4s, v9.4s
- add v7.4s, v7.4s, v31.4s
- orr v0.16b, v0.16b, v15.16b
- tbl v3.16b, { v3.16b }, v19.16b
- eor v14.16b, v14.16b, v4.16b
- add v7.4s, v7.4s, v2.4s
- add v13.4s, v13.4s, v28.4s
- add v6.4s, v6.4s, v3.4s
- tbl v14.16b, { v14.16b }, v19.16b
- eor v11.16b, v11.16b, v7.16b
- add v13.4s, v13.4s, v0.4s
- eor v10.16b, v6.16b, v10.16b
- add v8.4s, v8.4s, v14.4s
- tbl v11.16b, { v11.16b }, v19.16b
- eor v12.16b, v12.16b, v13.16b
- ushr v15.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- eor v9.16b, v8.16b, v9.16b
- add v1.4s, v1.4s, v11.4s
- tbl v12.16b, { v12.16b }, v19.16b
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v2.16b, v1.16b, v2.16b
- add v5.4s, v5.4s, v12.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #7
- shl v2.4s, v2.4s, #25
- eor v0.16b, v5.16b, v0.16b
- orr v2.16b, v2.16b, v15.16b
- ushr v15.4s, v0.4s, #7
+ add v7.4s, v7.4s, v4.4s
+ ushr v13.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v5.16b, v5.16b, v19.16b
+ orr v0.16b, v0.16b, v12.16b
+ eor v1.16b, v7.16b, v1.16b
+ add v22.4s, v22.4s, v29.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v12.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ add v17.4s, v17.4s, v23.4s
+ ushr v13.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v22.4s, v22.4s, v0.4s
+ orr v5.16b, v5.16b, v12.16b
+ add v17.4s, v17.4s, v2.4s
+ add v10.4s, v10.4s, v31.4s
+ orr v1.16b, v1.16b, v13.16b
+ eor v16.16b, v16.16b, v22.16b
+ add v20.4s, v20.4s, v30.4s
+ eor v6.16b, v6.16b, v17.16b
+ add v10.4s, v10.4s, v5.4s
+ tbl v16.16b, { v16.16b }, v18.16b
+ add v20.4s, v20.4s, v1.4s
+ tbl v6.16b, { v6.16b }, v18.16b
+ eor v25.16b, v25.16b, v10.16b
+ add v21.4s, v21.4s, v16.4s
+ eor v4.16b, v4.16b, v20.16b
+ add v26.4s, v26.4s, v6.4s
+ tbl v25.16b, { v25.16b }, v18.16b
+ eor v0.16b, v21.16b, v0.16b
+ tbl v4.16b, { v4.16b }, v18.16b
+ eor v2.16b, v26.16b, v2.16b
+ add v19.4s, v19.4s, v25.4s
+ ushr v12.4s, v0.4s, #7
shl v0.4s, v0.4s, #25
- orr v0.16b, v0.16b, v15.16b
- add v17.4s, v17.4s, v18.4s
- add v17.4s, v17.4s, v0.4s
- add v4.4s, v4.4s, v22.4s
- eor v14.16b, v14.16b, v17.16b
- add v4.4s, v4.4s, v10.4s
- add v7.4s, v7.4s, v26.4s
- tbl v14.16b, { v14.16b }, v16.16b
- eor v11.16b, v11.16b, v4.16b
- add v7.4s, v7.4s, v9.4s
- add v13.4s, v13.4s, v23.4s
- add v1.4s, v1.4s, v14.4s
- tbl v11.16b, { v11.16b }, v16.16b
- eor v12.16b, v12.16b, v7.16b
- add v13.4s, v13.4s, v2.4s
- mov v21.16b, v29.16b
- stur q29, [x29, #-128]
- mov v29.16b, v30.16b
- mov v30.16b, v27.16b
- mov v27.16b, v18.16b
- str q18, [sp, #176]
- eor v0.16b, v0.16b, v1.16b
- mov v18.16b, v22.16b
- add v5.4s, v5.4s, v11.4s
- tbl v12.16b, { v12.16b }, v16.16b
- eor v3.16b, v3.16b, v13.16b
- ldur q22, [x29, #-96]
- ushr v15.4s, v0.4s, #12
+ add v7.4s, v7.4s, v4.4s
+ ushr v13.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ eor v5.16b, v19.16b, v5.16b
+ add v10.4s, v10.4s, v3.4s
+ ldr q3, [sp, #192]
+ orr v0.16b, v0.16b, v12.16b
+ eor v1.16b, v7.16b, v1.16b
+ orr v2.16b, v2.16b, v13.16b
+ ushr v12.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ add v22.4s, v22.4s, v3.4s
+ ushr v13.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v10.4s, v10.4s, v0.4s
+ orr v5.16b, v5.16b, v12.16b
+ add v22.4s, v22.4s, v2.4s
+ add v20.4s, v20.4s, v15.4s
+ ldr q15, [sp, #128]
+ orr v1.16b, v1.16b, v13.16b
+ add v17.4s, v17.4s, v24.4s
+ eor v4.16b, v4.16b, v10.16b
+ eor v25.16b, v25.16b, v22.16b
+ add v20.4s, v20.4s, v5.4s
+ add v17.4s, v17.4s, v1.4s
+ tbl v4.16b, { v4.16b }, v27.16b
+ tbl v25.16b, { v25.16b }, v27.16b
+ eor v6.16b, v6.16b, v20.16b
+ eor v16.16b, v16.16b, v17.16b
+ add v26.4s, v26.4s, v4.4s
+ add v7.4s, v7.4s, v25.4s
+ tbl v6.16b, { v6.16b }, v27.16b
+ tbl v16.16b, { v16.16b }, v27.16b
+ eor v0.16b, v26.16b, v0.16b
+ eor v2.16b, v7.16b, v2.16b
+ add v21.4s, v21.4s, v6.4s
+ add v19.4s, v19.4s, v16.4s
+ ushr v12.4s, v0.4s, #12
shl v0.4s, v0.4s, #20
- eor v10.16b, v5.16b, v10.16b
- add v6.4s, v6.4s, v12.4s
- tbl v3.16b, { v3.16b }, v16.16b
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- eor v9.16b, v6.16b, v9.16b
- add v8.4s, v8.4s, v3.4s
- add v17.4s, v17.4s, v20.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v2.16b, v8.16b, v2.16b
- add v17.4s, v17.4s, v0.4s
- add v4.4s, v4.4s, v29.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #12
+ ushr v13.4s, v2.4s, #12
shl v2.4s, v2.4s, #20
- eor v14.16b, v14.16b, v17.16b
- add v4.4s, v4.4s, v10.4s
- add v7.4s, v7.4s, v22.4s
- orr v2.16b, v2.16b, v15.16b
- tbl v14.16b, { v14.16b }, v19.16b
- eor v11.16b, v11.16b, v4.16b
- add v7.4s, v7.4s, v9.4s
- add v13.4s, v13.4s, v31.4s
- add v1.4s, v1.4s, v14.4s
- tbl v11.16b, { v11.16b }, v19.16b
- eor v12.16b, v12.16b, v7.16b
- add v13.4s, v13.4s, v2.4s
- eor v0.16b, v0.16b, v1.16b
- add v5.4s, v5.4s, v11.4s
- tbl v12.16b, { v12.16b }, v19.16b
- eor v3.16b, v3.16b, v13.16b
- ushr v15.4s, v0.4s, #7
+ eor v5.16b, v21.16b, v5.16b
+ ldp q23, q11, [sp, #320]
+ eor v1.16b, v19.16b, v1.16b
+ orr v0.16b, v0.16b, v12.16b
+ add v10.4s, v10.4s, v8.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v13.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ add v22.4s, v22.4s, v23.4s
+ ushr v12.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v10.4s, v10.4s, v0.4s
+ mov v28.16b, v31.16b
+ mov v31.16b, v8.16b
+ ldr q8, [sp, #208]
+ orr v5.16b, v5.16b, v13.16b
+ add v22.4s, v22.4s, v2.4s
+ add v20.4s, v20.4s, v11.4s
+ orr v1.16b, v1.16b, v12.16b
+ add v17.4s, v17.4s, v8.4s
+ eor v4.16b, v4.16b, v10.16b
+ eor v25.16b, v25.16b, v22.16b
+ add v20.4s, v20.4s, v5.4s
+ add v17.4s, v17.4s, v1.4s
+ tbl v4.16b, { v4.16b }, v18.16b
+ tbl v25.16b, { v25.16b }, v18.16b
+ eor v6.16b, v6.16b, v20.16b
+ eor v16.16b, v16.16b, v17.16b
+ add v26.4s, v26.4s, v4.4s
+ add v7.4s, v7.4s, v25.4s
+ tbl v6.16b, { v6.16b }, v18.16b
+ tbl v16.16b, { v16.16b }, v18.16b
+ eor v0.16b, v26.16b, v0.16b
+ eor v2.16b, v7.16b, v2.16b
+ add v21.4s, v21.4s, v6.4s
+ add v19.4s, v19.4s, v16.4s
+ ushr v12.4s, v0.4s, #7
shl v0.4s, v0.4s, #25
- eor v10.16b, v5.16b, v10.16b
- add v6.4s, v6.4s, v12.4s
- tbl v3.16b, { v3.16b }, v19.16b
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- eor v9.16b, v6.16b, v9.16b
- add v8.4s, v8.4s, v3.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v2.16b, v8.16b, v2.16b
- add v17.4s, v17.4s, v21.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #7
+ ushr v13.4s, v2.4s, #7
shl v2.4s, v2.4s, #25
- add v17.4s, v17.4s, v10.4s
- add v4.4s, v4.4s, v24.4s
- orr v2.16b, v2.16b, v15.16b
- eor v3.16b, v3.16b, v17.16b
- add v4.4s, v4.4s, v9.4s
- add v7.4s, v7.4s, v30.4s
- tbl v3.16b, { v3.16b }, v16.16b
- eor v14.16b, v14.16b, v4.16b
- add v7.4s, v7.4s, v2.4s
- add v13.4s, v13.4s, v28.4s
- add v6.4s, v6.4s, v3.4s
- mov v22.16b, v24.16b
- tbl v14.16b, { v14.16b }, v16.16b
- eor v11.16b, v11.16b, v7.16b
- add v13.4s, v13.4s, v0.4s
- ldur q24, [x29, #-80]
- eor v10.16b, v6.16b, v10.16b
- add v8.4s, v8.4s, v14.4s
- mov v21.16b, v30.16b
- tbl v11.16b, { v11.16b }, v16.16b
- eor v12.16b, v12.16b, v13.16b
- ldur q30, [x29, #-192]
- mov v20.16b, v29.16b
- ushr v15.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- eor v9.16b, v8.16b, v9.16b
- add v1.4s, v1.4s, v11.4s
- tbl v12.16b, { v12.16b }, v16.16b
- ldur q29, [x29, #-112]
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v2.16b, v1.16b, v2.16b
- add v5.4s, v5.4s, v12.4s
- add v17.4s, v17.4s, v25.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #12
- shl v2.4s, v2.4s, #20
- eor v0.16b, v5.16b, v0.16b
- add v17.4s, v17.4s, v10.4s
- add v4.4s, v4.4s, v24.4s
- orr v2.16b, v2.16b, v15.16b
- ushr v15.4s, v0.4s, #12
+ eor v5.16b, v21.16b, v5.16b
+ eor v1.16b, v19.16b, v1.16b
+ orr v0.16b, v0.16b, v12.16b
+ add v22.4s, v22.4s, v29.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v13.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ add v17.4s, v17.4s, v30.4s
+ ushr v12.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v22.4s, v22.4s, v0.4s
+ orr v5.16b, v5.16b, v13.16b
+ add v17.4s, v17.4s, v2.4s
+ add v10.4s, v10.4s, v9.4s
+ orr v1.16b, v1.16b, v12.16b
+ eor v16.16b, v16.16b, v22.16b
+ add v20.4s, v20.4s, v14.4s
+ ldr q14, [sp, #256]
+ eor v6.16b, v6.16b, v17.16b
+ add v10.4s, v10.4s, v5.4s
+ tbl v16.16b, { v16.16b }, v27.16b
+ add v20.4s, v20.4s, v1.4s
+ tbl v6.16b, { v6.16b }, v27.16b
+ eor v25.16b, v25.16b, v10.16b
+ add v21.4s, v21.4s, v16.4s
+ eor v4.16b, v4.16b, v20.16b
+ add v26.4s, v26.4s, v6.4s
+ tbl v25.16b, { v25.16b }, v27.16b
+ eor v0.16b, v21.16b, v0.16b
+ tbl v4.16b, { v4.16b }, v27.16b
+ eor v2.16b, v26.16b, v2.16b
+ add v19.4s, v19.4s, v25.4s
+ ushr v12.4s, v0.4s, #12
shl v0.4s, v0.4s, #20
- eor v3.16b, v3.16b, v17.16b
- add v4.4s, v4.4s, v9.4s
- add v7.4s, v7.4s, v30.4s
- orr v0.16b, v0.16b, v15.16b
- tbl v3.16b, { v3.16b }, v19.16b
- eor v14.16b, v14.16b, v4.16b
- add v7.4s, v7.4s, v2.4s
- add v13.4s, v13.4s, v29.4s
- add v6.4s, v6.4s, v3.4s
- tbl v14.16b, { v14.16b }, v19.16b
- eor v11.16b, v11.16b, v7.16b
- add v13.4s, v13.4s, v0.4s
- eor v10.16b, v6.16b, v10.16b
- add v8.4s, v8.4s, v14.4s
- tbl v11.16b, { v11.16b }, v19.16b
- eor v12.16b, v12.16b, v13.16b
- ushr v15.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- eor v9.16b, v8.16b, v9.16b
- add v1.4s, v1.4s, v11.4s
- tbl v12.16b, { v12.16b }, v19.16b
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v2.16b, v1.16b, v2.16b
- add v5.4s, v5.4s, v12.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #7
- shl v2.4s, v2.4s, #25
- eor v0.16b, v5.16b, v0.16b
- orr v2.16b, v2.16b, v15.16b
- ushr v15.4s, v0.4s, #7
+ add v7.4s, v7.4s, v4.4s
+ ushr v13.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v5.16b, v5.16b, v19.16b
+ orr v0.16b, v0.16b, v12.16b
+ eor v1.16b, v7.16b, v1.16b
+ add v22.4s, v22.4s, v3.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v12.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ add v17.4s, v17.4s, v15.4s
+ ushr v13.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v22.4s, v22.4s, v0.4s
+ orr v5.16b, v5.16b, v12.16b
+ add v17.4s, v17.4s, v2.4s
+ add v10.4s, v10.4s, v14.4s
+ orr v1.16b, v1.16b, v13.16b
+ eor v16.16b, v16.16b, v22.16b
+ add v20.4s, v20.4s, v8.4s
+ eor v6.16b, v6.16b, v17.16b
+ add v10.4s, v10.4s, v5.4s
+ tbl v16.16b, { v16.16b }, v18.16b
+ add v20.4s, v20.4s, v1.4s
+ tbl v6.16b, { v6.16b }, v18.16b
+ eor v25.16b, v25.16b, v10.16b
+ add v21.4s, v21.4s, v16.4s
+ eor v4.16b, v4.16b, v20.16b
+ add v26.4s, v26.4s, v6.4s
+ tbl v25.16b, { v25.16b }, v18.16b
+ eor v0.16b, v21.16b, v0.16b
+ tbl v4.16b, { v4.16b }, v18.16b
+ eor v2.16b, v26.16b, v2.16b
+ add v19.4s, v19.4s, v25.4s
+ ushr v12.4s, v0.4s, #7
shl v0.4s, v0.4s, #25
- orr v0.16b, v0.16b, v15.16b
- add v17.4s, v17.4s, v18.4s
- add v17.4s, v17.4s, v0.4s
- add v4.4s, v4.4s, v20.4s
- eor v14.16b, v14.16b, v17.16b
- add v4.4s, v4.4s, v10.4s
- add v7.4s, v7.4s, v31.4s
- tbl v14.16b, { v14.16b }, v16.16b
- eor v11.16b, v11.16b, v4.16b
- add v7.4s, v7.4s, v9.4s
- add v13.4s, v13.4s, v26.4s
- add v1.4s, v1.4s, v14.4s
- tbl v11.16b, { v11.16b }, v16.16b
- eor v12.16b, v12.16b, v7.16b
- add v13.4s, v13.4s, v2.4s
- eor v0.16b, v0.16b, v1.16b
- add v5.4s, v5.4s, v11.4s
- tbl v12.16b, { v12.16b }, v16.16b
- eor v3.16b, v3.16b, v13.16b
- ushr v15.4s, v0.4s, #12
+ add v7.4s, v7.4s, v4.4s
+ ushr v13.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ eor v5.16b, v19.16b, v5.16b
+ orr v0.16b, v0.16b, v12.16b
+ eor v1.16b, v7.16b, v1.16b
+ add v10.4s, v10.4s, v28.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v12.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ add v22.4s, v22.4s, v24.4s
+ ushr v13.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v10.4s, v10.4s, v0.4s
+ orr v5.16b, v5.16b, v12.16b
+ add v22.4s, v22.4s, v2.4s
+ add v20.4s, v20.4s, v11.4s
+ ldr q11, [sp, #304]
+ orr v1.16b, v1.16b, v13.16b
+ add v17.4s, v17.4s, v31.4s
+ ldr q31, [sp, #224]
+ eor v4.16b, v4.16b, v10.16b
+ eor v25.16b, v25.16b, v22.16b
+ add v20.4s, v20.4s, v5.4s
+ add v17.4s, v17.4s, v1.4s
+ tbl v4.16b, { v4.16b }, v27.16b
+ tbl v25.16b, { v25.16b }, v27.16b
+ eor v6.16b, v6.16b, v20.16b
+ eor v16.16b, v16.16b, v17.16b
+ add v26.4s, v26.4s, v4.4s
+ add v7.4s, v7.4s, v25.4s
+ tbl v6.16b, { v6.16b }, v27.16b
+ tbl v16.16b, { v16.16b }, v27.16b
+ eor v0.16b, v26.16b, v0.16b
+ eor v2.16b, v7.16b, v2.16b
+ add v21.4s, v21.4s, v6.4s
+ add v19.4s, v19.4s, v16.4s
+ ushr v12.4s, v0.4s, #12
shl v0.4s, v0.4s, #20
- eor v10.16b, v5.16b, v10.16b
- add v6.4s, v6.4s, v12.4s
- tbl v3.16b, { v3.16b }, v16.16b
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- eor v9.16b, v6.16b, v9.16b
- add v8.4s, v8.4s, v3.4s
- add v17.4s, v17.4s, v23.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v2.16b, v8.16b, v2.16b
- add v17.4s, v17.4s, v0.4s
- add v4.4s, v4.4s, v22.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #12
+ ushr v13.4s, v2.4s, #12
shl v2.4s, v2.4s, #20
- eor v14.16b, v14.16b, v17.16b
- add v4.4s, v4.4s, v10.4s
- add v7.4s, v7.4s, v27.4s
- orr v2.16b, v2.16b, v15.16b
- tbl v14.16b, { v14.16b }, v19.16b
- eor v11.16b, v11.16b, v4.16b
- add v7.4s, v7.4s, v9.4s
- add v13.4s, v13.4s, v30.4s
- add v1.4s, v1.4s, v14.4s
- tbl v11.16b, { v11.16b }, v19.16b
- eor v12.16b, v12.16b, v7.16b
- add v13.4s, v13.4s, v2.4s
- ldur q27, [x29, #-160]
- eor v0.16b, v0.16b, v1.16b
- add v5.4s, v5.4s, v11.4s
- tbl v12.16b, { v12.16b }, v19.16b
- eor v3.16b, v3.16b, v13.16b
- ushr v15.4s, v0.4s, #7
+ eor v5.16b, v21.16b, v5.16b
+ eor v1.16b, v19.16b, v1.16b
+ orr v0.16b, v0.16b, v12.16b
+ add v10.4s, v10.4s, v23.4s
+ ldr q23, [sp, #240]
+ orr v2.16b, v2.16b, v13.16b
+ ushr v13.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ add v22.4s, v22.4s, v11.4s
+ mov v30.16b, v8.16b
+ mov v8.16b, v24.16b
+ ldr q24, [sp, #352]
+ ushr v12.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v10.4s, v10.4s, v0.4s
+ orr v5.16b, v5.16b, v13.16b
+ str q8, [sp, #112]
+ add v22.4s, v22.4s, v2.4s
+ add v20.4s, v20.4s, v24.4s
+ orr v1.16b, v1.16b, v12.16b
+ add v17.4s, v17.4s, v31.4s
+ eor v4.16b, v4.16b, v10.16b
+ eor v25.16b, v25.16b, v22.16b
+ add v20.4s, v20.4s, v5.4s
+ add v17.4s, v17.4s, v1.4s
+ tbl v4.16b, { v4.16b }, v18.16b
+ tbl v25.16b, { v25.16b }, v18.16b
+ eor v6.16b, v6.16b, v20.16b
+ eor v16.16b, v16.16b, v17.16b
+ add v26.4s, v26.4s, v4.4s
+ add v7.4s, v7.4s, v25.4s
+ tbl v6.16b, { v6.16b }, v18.16b
+ tbl v16.16b, { v16.16b }, v18.16b
+ eor v0.16b, v26.16b, v0.16b
+ eor v2.16b, v7.16b, v2.16b
+ add v21.4s, v21.4s, v6.4s
+ mov v29.16b, v3.16b
+ add v19.4s, v19.4s, v16.4s
+ ushr v12.4s, v0.4s, #7
shl v0.4s, v0.4s, #25
- eor v10.16b, v5.16b, v10.16b
- add v6.4s, v6.4s, v12.4s
- tbl v3.16b, { v3.16b }, v19.16b
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- eor v9.16b, v6.16b, v9.16b
- add v8.4s, v8.4s, v3.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v2.16b, v8.16b, v2.16b
- add v17.4s, v17.4s, v27.4s
- mov v28.16b, v25.16b
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #7
+ ushr v13.4s, v2.4s, #7
shl v2.4s, v2.4s, #25
- add v17.4s, v17.4s, v10.4s
- add v4.4s, v4.4s, v21.4s
- orr v2.16b, v2.16b, v15.16b
- eor v3.16b, v3.16b, v17.16b
- add v4.4s, v4.4s, v9.4s
- add v7.4s, v7.4s, v28.4s
- tbl v3.16b, { v3.16b }, v16.16b
- eor v14.16b, v14.16b, v4.16b
- add v7.4s, v7.4s, v2.4s
- add v13.4s, v13.4s, v29.4s
- mov v25.16b, v31.16b
- add v6.4s, v6.4s, v3.4s
- tbl v14.16b, { v14.16b }, v16.16b
- eor v11.16b, v11.16b, v7.16b
- add v13.4s, v13.4s, v0.4s
- ldur q31, [x29, #-96]
- eor v10.16b, v6.16b, v10.16b
- add v8.4s, v8.4s, v14.4s
- tbl v11.16b, { v11.16b }, v16.16b
- eor v12.16b, v12.16b, v13.16b
- ldur q28, [x29, #-208]
- mov v18.16b, v20.16b
- str q20, [sp, #144]
- ushr v15.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- eor v9.16b, v8.16b, v9.16b
- add v1.4s, v1.4s, v11.4s
- tbl v12.16b, { v12.16b }, v16.16b
- ldur q20, [x29, #-128]
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v2.16b, v1.16b, v2.16b
- add v5.4s, v5.4s, v12.4s
- add v17.4s, v17.4s, v24.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #12
- shl v2.4s, v2.4s, #20
- eor v0.16b, v5.16b, v0.16b
- add v17.4s, v17.4s, v10.4s
- add v4.4s, v4.4s, v31.4s
- orr v2.16b, v2.16b, v15.16b
- ushr v15.4s, v0.4s, #12
+ eor v5.16b, v21.16b, v5.16b
+ eor v1.16b, v19.16b, v1.16b
+ orr v0.16b, v0.16b, v12.16b
+ add v22.4s, v22.4s, v29.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v13.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ add v17.4s, v17.4s, v30.4s
+ ldr q30, [sp, #272]
+ ushr v12.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v22.4s, v22.4s, v0.4s
+ mov v3.16b, v28.16b
+ ldr q28, [sp, #176]
+ orr v5.16b, v5.16b, v13.16b
+ add v17.4s, v17.4s, v2.4s
+ add v10.4s, v10.4s, v30.4s
+ orr v1.16b, v1.16b, v12.16b
+ eor v16.16b, v16.16b, v22.16b
+ add v20.4s, v20.4s, v28.4s
+ eor v6.16b, v6.16b, v17.16b
+ add v10.4s, v10.4s, v5.4s
+ tbl v16.16b, { v16.16b }, v27.16b
+ add v20.4s, v20.4s, v1.4s
+ tbl v6.16b, { v6.16b }, v27.16b
+ eor v25.16b, v25.16b, v10.16b
+ add v21.4s, v21.4s, v16.4s
+ eor v4.16b, v4.16b, v20.16b
+ add v26.4s, v26.4s, v6.4s
+ tbl v25.16b, { v25.16b }, v27.16b
+ eor v0.16b, v21.16b, v0.16b
+ tbl v4.16b, { v4.16b }, v27.16b
+ eor v2.16b, v26.16b, v2.16b
+ add v19.4s, v19.4s, v25.4s
+ ushr v12.4s, v0.4s, #12
shl v0.4s, v0.4s, #20
- eor v3.16b, v3.16b, v17.16b
- add v4.4s, v4.4s, v9.4s
- add v7.4s, v7.4s, v28.4s
- orr v0.16b, v0.16b, v15.16b
- tbl v3.16b, { v3.16b }, v19.16b
- eor v14.16b, v14.16b, v4.16b
- add v7.4s, v7.4s, v2.4s
- add v13.4s, v13.4s, v20.4s
- add v6.4s, v6.4s, v3.4s
- tbl v14.16b, { v14.16b }, v19.16b
- eor v11.16b, v11.16b, v7.16b
- add v13.4s, v13.4s, v0.4s
- eor v10.16b, v6.16b, v10.16b
- add v8.4s, v8.4s, v14.4s
- tbl v11.16b, { v11.16b }, v19.16b
- eor v12.16b, v12.16b, v13.16b
- ushr v15.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- eor v9.16b, v8.16b, v9.16b
- add v1.4s, v1.4s, v11.4s
- tbl v12.16b, { v12.16b }, v19.16b
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v2.16b, v1.16b, v2.16b
- add v5.4s, v5.4s, v12.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #7
- shl v2.4s, v2.4s, #25
- eor v0.16b, v5.16b, v0.16b
- orr v2.16b, v2.16b, v15.16b
- ushr v15.4s, v0.4s, #7
+ add v7.4s, v7.4s, v4.4s
+ ushr v13.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v5.16b, v5.16b, v19.16b
+ orr v0.16b, v0.16b, v12.16b
+ eor v1.16b, v7.16b, v1.16b
+ add v22.4s, v22.4s, v8.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v12.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ add v17.4s, v17.4s, v9.4s
+ ldr q9, [sp, #320]
+ ushr v13.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v22.4s, v22.4s, v0.4s
+ orr v5.16b, v5.16b, v12.16b
+ add v17.4s, v17.4s, v2.4s
+ add v10.4s, v10.4s, v23.4s
+ orr v1.16b, v1.16b, v13.16b
+ eor v16.16b, v16.16b, v22.16b
+ add v20.4s, v20.4s, v31.4s
+ eor v6.16b, v6.16b, v17.16b
+ add v10.4s, v10.4s, v5.4s
+ tbl v16.16b, { v16.16b }, v18.16b
+ add v20.4s, v20.4s, v1.4s
+ tbl v6.16b, { v6.16b }, v18.16b
+ eor v25.16b, v25.16b, v10.16b
+ add v21.4s, v21.4s, v16.4s
+ eor v4.16b, v4.16b, v20.16b
+ add v26.4s, v26.4s, v6.4s
+ tbl v25.16b, { v25.16b }, v18.16b
+ eor v0.16b, v21.16b, v0.16b
+ tbl v4.16b, { v4.16b }, v18.16b
+ eor v2.16b, v26.16b, v2.16b
+ add v19.4s, v19.4s, v25.4s
+ ushr v12.4s, v0.4s, #7
shl v0.4s, v0.4s, #25
- orr v0.16b, v0.16b, v15.16b
- add v17.4s, v17.4s, v18.4s
- add v17.4s, v17.4s, v0.4s
- add v4.4s, v4.4s, v22.4s
- eor v14.16b, v14.16b, v17.16b
- add v4.4s, v4.4s, v10.4s
- add v7.4s, v7.4s, v30.4s
- tbl v14.16b, { v14.16b }, v16.16b
- eor v11.16b, v11.16b, v4.16b
- add v7.4s, v7.4s, v9.4s
- add v13.4s, v13.4s, v25.4s
- add v1.4s, v1.4s, v14.4s
- tbl v11.16b, { v11.16b }, v16.16b
- eor v12.16b, v12.16b, v7.16b
- add v13.4s, v13.4s, v2.4s
- eor v0.16b, v0.16b, v1.16b
- add v5.4s, v5.4s, v11.4s
- tbl v12.16b, { v12.16b }, v16.16b
- eor v3.16b, v3.16b, v13.16b
- add v17.4s, v17.4s, v26.4s
- mov v26.16b, v21.16b
- add v4.4s, v4.4s, v21.4s
- ldur q21, [x29, #-144]
- ushr v15.4s, v0.4s, #12
+ add v7.4s, v7.4s, v4.4s
+ ushr v13.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ eor v5.16b, v19.16b, v5.16b
+ add v10.4s, v10.4s, v14.4s
+ ldr q14, [sp, #288]
+ orr v0.16b, v0.16b, v12.16b
+ eor v1.16b, v7.16b, v1.16b
+ orr v2.16b, v2.16b, v13.16b
+ ushr v12.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ add v22.4s, v22.4s, v14.4s
+ ushr v13.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v10.4s, v10.4s, v0.4s
+ orr v5.16b, v5.16b, v12.16b
+ add v22.4s, v22.4s, v2.4s
+ add v20.4s, v20.4s, v24.4s
+ orr v1.16b, v1.16b, v13.16b
+ eor v4.16b, v4.16b, v10.16b
+ add v17.4s, v17.4s, v9.4s
+ eor v25.16b, v25.16b, v22.16b
+ add v20.4s, v20.4s, v5.4s
+ tbl v4.16b, { v4.16b }, v27.16b
+ add v17.4s, v17.4s, v1.4s
+ tbl v25.16b, { v25.16b }, v27.16b
+ eor v6.16b, v6.16b, v20.16b
+ add v26.4s, v26.4s, v4.4s
+ eor v16.16b, v16.16b, v17.16b
+ add v7.4s, v7.4s, v25.4s
+ tbl v6.16b, { v6.16b }, v27.16b
+ eor v0.16b, v26.16b, v0.16b
+ tbl v16.16b, { v16.16b }, v27.16b
+ eor v2.16b, v7.16b, v2.16b
+ add v21.4s, v21.4s, v6.4s
+ ushr v12.4s, v0.4s, #12
shl v0.4s, v0.4s, #20
- eor v10.16b, v5.16b, v10.16b
- add v6.4s, v6.4s, v12.4s
- tbl v3.16b, { v3.16b }, v16.16b
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- eor v9.16b, v6.16b, v9.16b
- add v8.4s, v8.4s, v3.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v2.16b, v8.16b, v2.16b
- add v17.4s, v17.4s, v0.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #12
+ add v19.4s, v19.4s, v16.4s
+ ushr v13.4s, v2.4s, #12
shl v2.4s, v2.4s, #20
- eor v14.16b, v14.16b, v17.16b
- add v4.4s, v4.4s, v10.4s
- add v7.4s, v7.4s, v21.4s
- orr v2.16b, v2.16b, v15.16b
- tbl v14.16b, { v14.16b }, v19.16b
- eor v11.16b, v11.16b, v4.16b
- add v7.4s, v7.4s, v9.4s
- add v13.4s, v13.4s, v28.4s
- add v1.4s, v1.4s, v14.4s
- tbl v11.16b, { v11.16b }, v19.16b
- eor v12.16b, v12.16b, v7.16b
- add v13.4s, v13.4s, v2.4s
- str q23, [sp, #160]
- eor v0.16b, v0.16b, v1.16b
- add v5.4s, v5.4s, v11.4s
- tbl v12.16b, { v12.16b }, v19.16b
- eor v3.16b, v3.16b, v13.16b
- add v17.4s, v17.4s, v23.4s
- ldur q23, [x29, #-64]
- ushr v15.4s, v0.4s, #7
+ eor v5.16b, v21.16b, v5.16b
+ orr v0.16b, v0.16b, v12.16b
+ eor v1.16b, v19.16b, v1.16b
+ add v10.4s, v10.4s, v11.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v13.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ ushr v12.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v10.4s, v10.4s, v0.4s
+ add v22.4s, v22.4s, v15.4s
+ orr v5.16b, v5.16b, v13.16b
+ add v20.4s, v20.4s, v3.4s
+ mov v24.16b, v3.16b
+ ldr q3, [sp, #336]
+ orr v1.16b, v1.16b, v12.16b
+ eor v4.16b, v4.16b, v10.16b
+ add v22.4s, v22.4s, v2.4s
+ add v17.4s, v17.4s, v3.4s
+ add v20.4s, v20.4s, v5.4s
+ tbl v4.16b, { v4.16b }, v18.16b
+ eor v25.16b, v25.16b, v22.16b
+ add v17.4s, v17.4s, v1.4s
+ eor v6.16b, v6.16b, v20.16b
+ add v26.4s, v26.4s, v4.4s
+ tbl v25.16b, { v25.16b }, v18.16b
+ eor v16.16b, v16.16b, v17.16b
+ tbl v6.16b, { v6.16b }, v18.16b
+ eor v0.16b, v26.16b, v0.16b
+ add v7.4s, v7.4s, v25.4s
+ tbl v16.16b, { v16.16b }, v18.16b
+ add v21.4s, v21.4s, v6.4s
+ ushr v12.4s, v0.4s, #7
shl v0.4s, v0.4s, #25
- eor v10.16b, v5.16b, v10.16b
- add v6.4s, v6.4s, v12.4s
- tbl v3.16b, { v3.16b }, v19.16b
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- eor v9.16b, v6.16b, v9.16b
- add v8.4s, v8.4s, v3.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v2.16b, v8.16b, v2.16b
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #7
+ eor v2.16b, v7.16b, v2.16b
+ add v19.4s, v19.4s, v16.4s
+ eor v5.16b, v21.16b, v5.16b
+ orr v0.16b, v0.16b, v12.16b
+ ushr v12.4s, v2.4s, #7
shl v2.4s, v2.4s, #25
- add v17.4s, v17.4s, v10.4s
- add v4.4s, v4.4s, v23.4s
- orr v2.16b, v2.16b, v15.16b
- eor v3.16b, v3.16b, v17.16b
- add v4.4s, v4.4s, v9.4s
- add v7.4s, v7.4s, v24.4s
- tbl v3.16b, { v3.16b }, v16.16b
- eor v14.16b, v14.16b, v4.16b
- add v7.4s, v7.4s, v2.4s
- add v6.4s, v6.4s, v3.4s
- tbl v14.16b, { v14.16b }, v16.16b
- eor v11.16b, v11.16b, v7.16b
- add v13.4s, v13.4s, v20.4s
- eor v10.16b, v6.16b, v10.16b
- add v8.4s, v8.4s, v14.4s
- tbl v11.16b, { v11.16b }, v16.16b
- add v13.4s, v13.4s, v0.4s
- ldr q20, [sp, #176]
- ushr v15.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- eor v9.16b, v8.16b, v9.16b
- add v1.4s, v1.4s, v11.4s
- eor v12.16b, v12.16b, v13.16b
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v2.16b, v1.16b, v2.16b
- tbl v12.16b, { v12.16b }, v16.16b
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #12
- shl v2.4s, v2.4s, #20
- add v5.4s, v5.4s, v12.4s
+ eor v1.16b, v19.16b, v1.16b
+ ushr v13.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ add v22.4s, v22.4s, v8.4s
+ orr v2.16b, v2.16b, v12.16b
+ ushr v12.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ orr v5.16b, v5.16b, v13.16b
+ add v22.4s, v22.4s, v0.4s
+ add v10.4s, v10.4s, v29.4s
+ ldr q29, [sp, #208]
add v17.4s, v17.4s, v31.4s
- orr v2.16b, v2.16b, v15.16b
- eor v0.16b, v5.16b, v0.16b
- add v17.4s, v17.4s, v10.4s
- add v4.4s, v4.4s, v20.4s
- add v7.4s, v7.4s, v29.4s
- ushr v15.4s, v0.4s, #12
+ orr v1.16b, v1.16b, v12.16b
+ add v20.4s, v20.4s, v29.4s
+ eor v16.16b, v16.16b, v22.16b
+ add v10.4s, v10.4s, v5.4s
+ add v17.4s, v17.4s, v2.4s
+ add v20.4s, v20.4s, v1.4s
+ tbl v16.16b, { v16.16b }, v27.16b
+ eor v25.16b, v25.16b, v10.16b
+ eor v6.16b, v6.16b, v17.16b
+ eor v4.16b, v4.16b, v20.16b
+ add v21.4s, v21.4s, v16.4s
+ tbl v25.16b, { v25.16b }, v27.16b
+ tbl v6.16b, { v6.16b }, v27.16b
+ tbl v4.16b, { v4.16b }, v27.16b
+ eor v0.16b, v21.16b, v0.16b
+ add v19.4s, v19.4s, v25.4s
+ add v26.4s, v26.4s, v6.4s
+ add v7.4s, v7.4s, v4.4s
+ ushr v12.4s, v0.4s, #12
shl v0.4s, v0.4s, #20
- eor v3.16b, v3.16b, v17.16b
- add v4.4s, v4.4s, v9.4s
- add v7.4s, v7.4s, v2.4s
- orr v0.16b, v0.16b, v15.16b
- mov v15.16b, v31.16b
- add v17.4s, v17.4s, v22.4s
- eor v31.16b, v14.16b, v4.16b
- eor v22.16b, v11.16b, v7.16b
- add v11.4s, v13.4s, v27.4s
- tbl v3.16b, { v3.16b }, v19.16b
- add v11.4s, v11.4s, v0.4s
- tbl v31.16b, { v31.16b }, v19.16b
- add v6.4s, v6.4s, v3.4s
- eor v12.16b, v12.16b, v11.16b
- tbl v22.16b, { v22.16b }, v19.16b
- add v8.4s, v8.4s, v31.4s
- eor v10.16b, v6.16b, v10.16b
- add v30.4s, v11.4s, v30.4s
- tbl v11.16b, { v12.16b }, v19.16b
- add v1.4s, v1.4s, v22.4s
- eor v9.16b, v8.16b, v9.16b
- ushr v12.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- add v5.4s, v5.4s, v11.4s
- eor v2.16b, v1.16b, v2.16b
- orr v10.16b, v10.16b, v12.16b
- ushr v12.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v0.16b, v5.16b, v0.16b
- orr v9.16b, v9.16b, v12.16b
- ushr v12.4s, v2.4s, #7
+ eor v5.16b, v5.16b, v19.16b
+ eor v2.16b, v26.16b, v2.16b
+ eor v1.16b, v7.16b, v1.16b
+ orr v0.16b, v0.16b, v12.16b
+ ushr v12.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ add v22.4s, v22.4s, v14.4s
+ mov v8.16b, v31.16b
+ ushr v13.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ mov v31.16b, v14.16b
+ ushr v14.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ orr v5.16b, v5.16b, v12.16b
+ add v22.4s, v22.4s, v0.4s
+ add v10.4s, v10.4s, v28.4s
+ ldr q28, [sp, #352]
+ orr v2.16b, v2.16b, v13.16b
+ orr v1.16b, v1.16b, v14.16b
+ add v17.4s, v17.4s, v30.4s
+ add v20.4s, v20.4s, v3.4s
+ eor v16.16b, v16.16b, v22.16b
+ add v10.4s, v10.4s, v5.4s
+ add v17.4s, v17.4s, v2.4s
+ add v20.4s, v20.4s, v1.4s
+ tbl v16.16b, { v16.16b }, v18.16b
+ eor v25.16b, v25.16b, v10.16b
+ eor v6.16b, v6.16b, v17.16b
+ eor v4.16b, v4.16b, v20.16b
+ add v21.4s, v21.4s, v16.4s
+ tbl v25.16b, { v25.16b }, v18.16b
+ tbl v6.16b, { v6.16b }, v18.16b
+ tbl v4.16b, { v4.16b }, v18.16b
+ eor v0.16b, v21.16b, v0.16b
+ add v19.4s, v19.4s, v25.4s
+ add v26.4s, v26.4s, v6.4s
+ add v7.4s, v7.4s, v4.4s
+ ushr v12.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ eor v5.16b, v19.16b, v5.16b
+ eor v2.16b, v26.16b, v2.16b
+ eor v1.16b, v7.16b, v1.16b
+ orr v0.16b, v0.16b, v12.16b
+ ushr v12.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ add v10.4s, v10.4s, v23.4s
+ ushr v13.4s, v2.4s, #7
shl v2.4s, v2.4s, #25
+ ushr v14.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ orr v5.16b, v5.16b, v12.16b
+ add v10.4s, v10.4s, v0.4s
+ add v20.4s, v20.4s, v24.4s
+ ldr q24, [sp, #144]
+ orr v2.16b, v2.16b, v13.16b
+ orr v1.16b, v1.16b, v14.16b
+ add v22.4s, v22.4s, v9.4s
+ add v17.4s, v17.4s, v11.4s
+ eor v4.16b, v4.16b, v10.16b
+ add v20.4s, v20.4s, v5.4s
+ add v22.4s, v22.4s, v2.4s
+ add v17.4s, v17.4s, v1.4s
+ tbl v4.16b, { v4.16b }, v27.16b
+ eor v6.16b, v6.16b, v20.16b
+ eor v25.16b, v25.16b, v22.16b
+ eor v16.16b, v16.16b, v17.16b
+ add v26.4s, v26.4s, v4.4s
+ tbl v6.16b, { v6.16b }, v27.16b
+ tbl v25.16b, { v25.16b }, v27.16b
+ tbl v16.16b, { v16.16b }, v27.16b
+ eor v0.16b, v26.16b, v0.16b
+ add v21.4s, v21.4s, v6.4s
+ add v7.4s, v7.4s, v25.4s
+ add v19.4s, v19.4s, v16.4s
+ ushr v12.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v5.16b, v21.16b, v5.16b
+ eor v2.16b, v7.16b, v2.16b
+ eor v1.16b, v19.16b, v1.16b
+ orr v0.16b, v0.16b, v12.16b
+ add v10.4s, v10.4s, v15.4s
+ ushr v14.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ mov v30.16b, v3.16b
+ ldr q3, [sp, #256]
+ ushr v12.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ ushr v13.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v10.4s, v10.4s, v0.4s
+ orr v5.16b, v5.16b, v14.16b
+ add v20.4s, v20.4s, v3.4s
orr v2.16b, v2.16b, v12.16b
+ orr v1.16b, v1.16b, v13.16b
+ add v22.4s, v22.4s, v24.4s
+ add v17.4s, v17.4s, v28.4s
+ eor v4.16b, v4.16b, v10.16b
+ add v20.4s, v20.4s, v5.4s
+ add v22.4s, v22.4s, v2.4s
+ add v17.4s, v17.4s, v1.4s
+ tbl v4.16b, { v4.16b }, v18.16b
+ eor v6.16b, v6.16b, v20.16b
+ eor v25.16b, v25.16b, v22.16b
+ eor v16.16b, v16.16b, v17.16b
+ add v26.4s, v26.4s, v4.4s
+ tbl v6.16b, { v6.16b }, v18.16b
+ tbl v25.16b, { v25.16b }, v18.16b
+ tbl v16.16b, { v16.16b }, v18.16b
+ eor v0.16b, v26.16b, v0.16b
+ add v21.4s, v21.4s, v6.4s
+ add v7.4s, v7.4s, v25.4s
+ add v19.4s, v19.4s, v16.4s
ushr v12.4s, v0.4s, #7
shl v0.4s, v0.4s, #25
+ eor v5.16b, v21.16b, v5.16b
+ eor v2.16b, v7.16b, v2.16b
+ eor v1.16b, v19.16b, v1.16b
orr v0.16b, v0.16b, v12.16b
- add v4.4s, v4.4s, v26.4s
- add v17.4s, v17.4s, v0.4s
- add v7.4s, v7.4s, v28.4s
- mov v18.16b, v27.16b
- eor v31.16b, v31.16b, v17.16b
- add v4.4s, v4.4s, v10.4s
- add v27.4s, v30.4s, v2.4s
- eor v22.16b, v22.16b, v4.16b
- add v7.4s, v7.4s, v9.4s
- eor v3.16b, v3.16b, v27.16b
- add v26.4s, v27.4s, v29.4s
- tbl v27.16b, { v31.16b }, v16.16b
- eor v28.16b, v11.16b, v7.16b
- tbl v22.16b, { v22.16b }, v16.16b
- add v1.4s, v1.4s, v27.4s
- add v4.4s, v4.4s, v23.4s
- ldr q23, [sp, #144]
- tbl v28.16b, { v28.16b }, v16.16b
- tbl v3.16b, { v3.16b }, v16.16b
- add v5.4s, v5.4s, v22.4s
- eor v0.16b, v0.16b, v1.16b
- add v6.4s, v6.4s, v28.4s
- add v29.4s, v8.4s, v3.4s
- eor v30.16b, v5.16b, v10.16b
- ushr v8.4s, v0.4s, #12
+ ushr v12.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ mov v23.16b, v9.16b
+ ldr q9, [sp, #112]
+ ushr v13.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ ushr v14.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ orr v5.16b, v5.16b, v12.16b
+ add v9.4s, v10.4s, v9.4s
+ orr v2.16b, v2.16b, v13.16b
+ orr v1.16b, v1.16b, v14.16b
+ ldr q14, [sp, #64]
+ add v22.4s, v22.4s, v31.4s
+ add v17.4s, v17.4s, v30.4s
+ add v20.4s, v20.4s, v8.4s
+ add v9.4s, v9.4s, v5.4s
+ add v22.4s, v22.4s, v0.4s
+ add v17.4s, v17.4s, v2.4s
+ add v20.4s, v20.4s, v1.4s
+ eor v25.16b, v25.16b, v9.16b
+ eor v16.16b, v16.16b, v22.16b
+ eor v6.16b, v6.16b, v17.16b
+ eor v4.16b, v4.16b, v20.16b
+ tbl v25.16b, { v25.16b }, v27.16b
+ tbl v16.16b, { v16.16b }, v27.16b
+ tbl v6.16b, { v6.16b }, v27.16b
+ tbl v4.16b, { v4.16b }, v27.16b
+ add v19.4s, v19.4s, v25.4s
+ add v21.4s, v21.4s, v16.4s
+ add v26.4s, v26.4s, v6.4s
+ add v7.4s, v7.4s, v4.4s
+ eor v5.16b, v5.16b, v19.16b
+ eor v0.16b, v21.16b, v0.16b
+ eor v2.16b, v26.16b, v2.16b
+ eor v1.16b, v7.16b, v1.16b
+ ushr v30.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ ushr v10.4s, v0.4s, #12
shl v0.4s, v0.4s, #20
- eor v31.16b, v6.16b, v9.16b
- orr v0.16b, v0.16b, v8.16b
- ushr v8.4s, v30.4s, #12
- shl v30.4s, v30.4s, #20
- eor v2.16b, v29.16b, v2.16b
- orr v30.16b, v30.16b, v8.16b
- ushr v8.4s, v31.4s, #12
- shl v31.4s, v31.4s, #20
- add v17.4s, v17.4s, v25.4s
- add v7.4s, v7.4s, v23.4s
- orr v31.16b, v31.16b, v8.16b
- ushr v8.4s, v2.4s, #12
+ ushr v12.4s, v2.4s, #12
shl v2.4s, v2.4s, #20
- ldur q23, [x29, #-176]
- orr v2.16b, v2.16b, v8.16b
- add v17.4s, v17.4s, v0.4s
- eor v27.16b, v27.16b, v17.16b
- add v4.4s, v4.4s, v30.4s
- add v25.4s, v26.4s, v2.4s
- eor v22.16b, v22.16b, v4.16b
- add v4.4s, v4.4s, v24.4s
- add v7.4s, v7.4s, v31.4s
- eor v3.16b, v3.16b, v25.16b
- add v24.4s, v25.4s, v18.4s
- tbl v25.16b, { v27.16b }, v19.16b
+ ushr v13.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ orr v5.16b, v5.16b, v30.16b
+ add v30.4s, v9.4s, v29.4s
+ add v22.4s, v22.4s, v23.4s
+ ldr q23, [sp, #192]
+ orr v0.16b, v0.16b, v10.16b
+ orr v2.16b, v2.16b, v12.16b
+ orr v1.16b, v1.16b, v13.16b
add v17.4s, v17.4s, v23.4s
- eor v23.16b, v28.16b, v7.16b
- tbl v22.16b, { v22.16b }, v19.16b
- add v1.4s, v1.4s, v25.4s
- tbl v23.16b, { v23.16b }, v19.16b
- tbl v3.16b, { v3.16b }, v19.16b
- add v5.4s, v5.4s, v22.4s
- eor v0.16b, v0.16b, v1.16b
- add v6.4s, v6.4s, v23.4s
- add v26.4s, v29.4s, v3.4s
- eor v27.16b, v5.16b, v30.16b
- ushr v29.4s, v0.4s, #7
- shl v0.4s, v0.4s, #25
- eor v28.16b, v6.16b, v31.16b
- orr v0.16b, v0.16b, v29.16b
- ushr v29.4s, v27.4s, #7
- shl v27.4s, v27.4s, #25
+ add v20.4s, v20.4s, v28.4s
+ add v23.4s, v30.4s, v5.4s
+ add v22.4s, v22.4s, v0.4s
+ add v17.4s, v17.4s, v2.4s
+ add v20.4s, v20.4s, v1.4s
+ eor v25.16b, v25.16b, v23.16b
+ eor v16.16b, v16.16b, v22.16b
+ eor v6.16b, v6.16b, v17.16b
+ eor v4.16b, v4.16b, v20.16b
+ tbl v25.16b, { v25.16b }, v18.16b
+ tbl v16.16b, { v16.16b }, v18.16b
+ tbl v6.16b, { v6.16b }, v18.16b
+ tbl v4.16b, { v4.16b }, v18.16b
+ add v19.4s, v19.4s, v25.4s
+ add v21.4s, v21.4s, v16.4s
+ add v26.4s, v26.4s, v6.4s
+ add v7.4s, v7.4s, v4.4s
+ eor v5.16b, v19.16b, v5.16b
+ eor v0.16b, v21.16b, v0.16b
eor v2.16b, v26.16b, v2.16b
- orr v27.16b, v27.16b, v29.16b
- ushr v29.4s, v28.4s, #7
- shl v28.4s, v28.4s, #25
- ldur q18, [x29, #-128]
- orr v28.16b, v28.16b, v29.16b
- ushr v29.4s, v2.4s, #7
+ eor v1.16b, v7.16b, v1.16b
+ ushr v28.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ ushr v30.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ ushr v31.4s, v2.4s, #7
shl v2.4s, v2.4s, #25
- add v7.4s, v7.4s, v15.4s
- orr v2.16b, v2.16b, v29.16b
- add v17.4s, v17.4s, v27.4s
- add v4.4s, v4.4s, v28.4s
- add v7.4s, v7.4s, v2.4s
- eor v3.16b, v3.16b, v17.16b
- add v17.4s, v17.4s, v20.4s
- eor v20.16b, v25.16b, v4.16b
- add v4.4s, v4.4s, v21.4s
- eor v21.16b, v22.16b, v7.16b
- add v7.4s, v7.4s, v18.4s
- add v18.4s, v24.4s, v0.4s
- eor v22.16b, v23.16b, v18.16b
- ldr q23, [sp, #160]
- tbl v3.16b, { v3.16b }, v16.16b
- tbl v20.16b, { v20.16b }, v16.16b
- add v6.4s, v6.4s, v3.4s
- add v18.4s, v18.4s, v23.4s
- tbl v21.16b, { v21.16b }, v16.16b
- tbl v16.16b, { v22.16b }, v16.16b
- add v22.4s, v26.4s, v20.4s
- eor v23.16b, v6.16b, v27.16b
- add v1.4s, v1.4s, v21.4s
- eor v24.16b, v22.16b, v28.16b
- ushr v25.4s, v23.4s, #12
- shl v23.4s, v23.4s, #20
- add v5.4s, v5.4s, v16.4s
- eor v2.16b, v1.16b, v2.16b
- orr v23.16b, v23.16b, v25.16b
- ushr v25.4s, v24.4s, #12
- shl v24.4s, v24.4s, #20
- eor v0.16b, v5.16b, v0.16b
- orr v24.16b, v24.16b, v25.16b
- ushr v25.4s, v2.4s, #12
+ ushr v8.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ orr v5.16b, v5.16b, v28.16b
+ ldr q28, [sp, #176]
+ orr v0.16b, v0.16b, v30.16b
+ orr v2.16b, v2.16b, v31.16b
+ orr v1.16b, v1.16b, v8.16b
+ add v23.4s, v23.4s, v28.4s
+ add v22.4s, v22.4s, v11.4s
+ add v17.4s, v17.4s, v15.4s
+ add v20.4s, v20.4s, v3.4s
+ ldr q3, [sp, #272]
+ add v23.4s, v23.4s, v0.4s
+ add v22.4s, v22.4s, v2.4s
+ add v17.4s, v17.4s, v1.4s
+ add v20.4s, v20.4s, v5.4s
+ eor v4.16b, v4.16b, v23.16b
+ eor v25.16b, v25.16b, v22.16b
+ eor v16.16b, v16.16b, v17.16b
+ eor v6.16b, v6.16b, v20.16b
+ tbl v4.16b, { v4.16b }, v27.16b
+ tbl v25.16b, { v25.16b }, v27.16b
+ tbl v16.16b, { v16.16b }, v27.16b
+ tbl v6.16b, { v6.16b }, v27.16b
+ add v26.4s, v26.4s, v4.4s
+ add v7.4s, v7.4s, v25.4s
+ add v19.4s, v19.4s, v16.4s
+ add v21.4s, v21.4s, v6.4s
+ eor v0.16b, v26.16b, v0.16b
+ eor v2.16b, v7.16b, v2.16b
+ eor v1.16b, v19.16b, v1.16b
+ eor v5.16b, v21.16b, v5.16b
+ add v3.4s, v22.4s, v3.4s
+ ldr q22, [sp, #160]
+ ushr v28.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ ushr v29.4s, v2.4s, #12
shl v2.4s, v2.4s, #20
+ ushr v30.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ ushr v31.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ add v17.4s, v17.4s, v22.4s
+ ldr q22, [sp, #240]
+ orr v0.16b, v0.16b, v28.16b
+ prfm pldl1keep, [x23, #256]
+ orr v2.16b, v2.16b, v29.16b
+ prfm pldl1keep, [x24, #256]
+ orr v1.16b, v1.16b, v30.16b
+ prfm pldl1keep, [x22, #256]
+ orr v5.16b, v5.16b, v31.16b
+ prfm pldl1keep, [x25, #256]
+ add v23.4s, v23.4s, v24.4s
+ add v20.4s, v20.4s, v22.4s
+ add v3.4s, v3.4s, v2.4s
+ add v17.4s, v17.4s, v1.4s
+ add v22.4s, v23.4s, v0.4s
+ add v20.4s, v20.4s, v5.4s
+ eor v23.16b, v25.16b, v3.16b
+ eor v16.16b, v16.16b, v17.16b
+ eor v4.16b, v4.16b, v22.16b
+ eor v6.16b, v6.16b, v20.16b
+ tbl v23.16b, { v23.16b }, v18.16b
+ tbl v16.16b, { v16.16b }, v18.16b
+ tbl v4.16b, { v4.16b }, v18.16b
+ tbl v6.16b, { v6.16b }, v18.16b
+ add v7.4s, v7.4s, v23.4s
+ add v19.4s, v19.4s, v16.4s
+ add v18.4s, v26.4s, v4.4s
+ add v21.4s, v21.4s, v6.4s
+ eor v2.16b, v7.16b, v2.16b
+ eor v1.16b, v19.16b, v1.16b
+ eor v0.16b, v18.16b, v0.16b
+ eor v5.16b, v21.16b, v5.16b
+ ushr v25.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ ushr v24.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ ushr v26.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ ushr v27.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ orr v0.16b, v0.16b, v24.16b
orr v2.16b, v2.16b, v25.16b
- ushr v25.4s, v0.4s, #12
- shl v0.4s, v0.4s, #20
- orr v0.16b, v0.16b, v25.16b
- add v25.4s, v7.4s, v2.4s
- add v26.4s, v18.4s, v0.4s
- eor v18.16b, v21.16b, v25.16b
- add v17.4s, v17.4s, v23.4s
- add v4.4s, v4.4s, v24.4s
- eor v16.16b, v16.16b, v26.16b
- tbl v21.16b, { v18.16b }, v19.16b
- eor v3.16b, v3.16b, v17.16b
- eor v7.16b, v20.16b, v4.16b
- tbl v16.16b, { v16.16b }, v19.16b
- add v1.4s, v1.4s, v21.4s
- tbl v3.16b, { v3.16b }, v19.16b
- tbl v20.16b, { v7.16b }, v19.16b
- eor v2.16b, v1.16b, v2.16b
- eor v7.16b, v1.16b, v17.16b
- add v1.4s, v5.4s, v16.4s
- eor v0.16b, v1.16b, v0.16b
- eor v18.16b, v1.16b, v4.16b
- add v1.4s, v6.4s, v3.4s
- eor v4.16b, v1.16b, v23.16b
- eor v6.16b, v25.16b, v1.16b
- add v1.4s, v22.4s, v20.4s
- eor v5.16b, v1.16b, v24.16b
- eor v17.16b, v26.16b, v1.16b
- ushr v1.4s, v4.4s, #7
+ orr v1.16b, v1.16b, v26.16b
+ orr v5.16b, v5.16b, v27.16b
+ movi v13.4s, #64
+ eor v29.16b, v19.16b, v22.16b
+ eor v8.16b, v21.16b, v3.16b
+ eor v30.16b, v17.16b, v18.16b
+ eor v31.16b, v20.16b, v7.16b
+ eor v24.16b, v5.16b, v23.16b
+ eor v18.16b, v0.16b, v16.16b
+ eor v25.16b, v2.16b, v6.16b
+ eor v26.16b, v1.16b, v4.16b
+ cbnz x21, .LBB3_5
+ b .LBB3_2
+.LBB3_6:
+ cbz x1, .LBB3_14
+ adrp x12, .LCPI3_3
+ ldr q0, [x11, :lo12:.LCPI3_1]
+ orr w11, w7, w6
+ ldr q2, [x10, :lo12:.LCPI3_2]
+ ldr q1, [x12, :lo12:.LCPI3_3]
+ and x12, x5, #0x1
+.LBB3_8:
+ movi v3.4s, #64
+ lsr x13, x4, #32
+ ldp q5, q4, [x3]
+ mov x15, x2
+ mov w14, w11
+ mov v3.s[0], w4
+ ldr x10, [x0]
+ mov v3.s[1], w13
+ b .LBB3_11
+.LBB3_9:
+ orr w14, w14, w9
+.LBB3_10:
+ ldp q6, q7, [x10]
+ mov v16.16b, v3.16b
+ and w14, w14, #0xff
+ add v5.4s, v5.4s, v4.4s
+ mov x15, x13
+ mov v16.s[3], w14
+ add x14, x10, #32
+ uzp1 v17.4s, v6.4s, v7.4s
+ add x10, x10, #64
+ add v5.4s, v5.4s, v17.4s
+ eor v16.16b, v5.16b, v16.16b
+ tbl v16.16b, { v16.16b }, v0.16b
+ add v18.4s, v16.4s, v1.4s
+ eor v19.16b, v18.16b, v4.16b
+ uzp2 v4.4s, v6.4s, v7.4s
+ ushr v6.4s, v19.4s, #12
+ shl v7.4s, v19.4s, #20
+ ld2 { v19.4s, v20.4s }, [x14]
+ add v5.4s, v5.4s, v4.4s
+ mov w14, w6
+ orr v6.16b, v7.16b, v6.16b
+ add v5.4s, v5.4s, v6.4s
+ eor v7.16b, v16.16b, v5.16b
+ add v5.4s, v5.4s, v19.4s
+ tbl v7.16b, { v7.16b }, v2.16b
+ ext v5.16b, v5.16b, v5.16b, #12
+ add v16.4s, v18.4s, v7.4s
+ ext v7.16b, v7.16b, v7.16b, #8
+ eor v6.16b, v6.16b, v16.16b
+ ext v16.16b, v16.16b, v16.16b, #4
+ ushr v18.4s, v6.4s, #7
+ shl v6.4s, v6.4s, #25
+ orr v6.16b, v6.16b, v18.16b
+ ext v18.16b, v20.16b, v20.16b, #12
+ add v5.4s, v5.4s, v6.4s
+ eor v7.16b, v5.16b, v7.16b
+ add v5.4s, v5.4s, v18.4s
+ tbl v7.16b, { v7.16b }, v0.16b
+ add v16.4s, v16.4s, v7.4s
+ eor v6.16b, v6.16b, v16.16b
+ ushr v21.4s, v6.4s, #12
+ shl v6.4s, v6.4s, #20
+ orr v6.16b, v6.16b, v21.16b
+ uzp1 v21.4s, v17.4s, v17.4s
+ add v5.4s, v5.4s, v6.4s
+ ext v21.16b, v21.16b, v17.16b, #8
+ eor v7.16b, v7.16b, v5.16b
+ uzp2 v21.4s, v21.4s, v4.4s
+ tbl v7.16b, { v7.16b }, v2.16b
+ add v5.4s, v5.4s, v21.4s
+ add v16.4s, v16.4s, v7.4s
+ ext v5.16b, v5.16b, v5.16b, #4
+ ext v7.16b, v7.16b, v7.16b, #8
+ eor v6.16b, v6.16b, v16.16b
+ ushr v22.4s, v6.4s, #7
+ shl v6.4s, v6.4s, #25
+ orr v6.16b, v6.16b, v22.16b
+ add v22.4s, v5.4s, v6.4s
+ eor v5.16b, v22.16b, v7.16b
+ ext v7.16b, v16.16b, v16.16b, #12
+ tbl v16.16b, { v5.16b }, v0.16b
+ ext v5.16b, v17.16b, v17.16b, #12
+ add v7.4s, v7.4s, v16.4s
+ ext v5.16b, v17.16b, v5.16b, #12
+ ext v17.16b, v19.16b, v19.16b, #12
+ mov v19.16b, v18.16b
+ eor v6.16b, v6.16b, v7.16b
+ rev64 v5.4s, v5.4s
+ mov v19.s[1], v17.s[2]
+ ushr v20.4s, v6.4s, #12
+ shl v6.4s, v6.4s, #20
+ trn2 v5.4s, v5.4s, v19.4s
+ orr v6.16b, v6.16b, v20.16b
+ zip1 v20.2d, v18.2d, v4.2d
+ zip2 v4.4s, v4.4s, v18.4s
+ add v19.4s, v6.4s, v5.4s
+ mov v20.s[3], v17.s[3]
+ add v19.4s, v19.4s, v22.4s
+ ext v22.16b, v20.16b, v20.16b, #12
+ eor v16.16b, v16.16b, v19.16b
+ ext v19.16b, v19.16b, v19.16b, #12
+ tbl v16.16b, { v16.16b }, v2.16b
+ add v7.4s, v7.4s, v16.4s
+ ext v16.16b, v16.16b, v16.16b, #8
+ eor v6.16b, v6.16b, v7.16b
+ ext v7.16b, v7.16b, v7.16b, #4
+ ushr v23.4s, v6.4s, #7
+ shl v24.4s, v6.4s, #25
+ uzp1 v6.4s, v20.4s, v22.4s
+ orr v20.16b, v24.16b, v23.16b
+ add v22.4s, v20.4s, v6.4s
+ add v19.4s, v22.4s, v19.4s
+ eor v16.16b, v19.16b, v16.16b
+ tbl v16.16b, { v16.16b }, v0.16b
+ add v7.4s, v7.4s, v16.4s
+ eor v18.16b, v20.16b, v7.16b
+ zip1 v20.4s, v4.4s, v17.4s
+ zip1 v4.4s, v17.4s, v4.4s
+ ushr v17.4s, v18.4s, #12
+ shl v18.4s, v18.4s, #20
+ ext v20.16b, v4.16b, v20.16b, #8
+ orr v4.16b, v18.16b, v17.16b
+ ext v18.16b, v21.16b, v21.16b, #4
+ add v17.4s, v4.4s, v20.4s
+ add v17.4s, v17.4s, v19.4s
+ uzp1 v19.4s, v18.4s, v18.4s
+ eor v16.16b, v16.16b, v17.16b
+ ext v19.16b, v19.16b, v18.16b, #8
+ tbl v16.16b, { v16.16b }, v2.16b
+ uzp2 v19.4s, v19.4s, v5.4s
+ add v7.4s, v7.4s, v16.4s
+ add v17.4s, v17.4s, v19.4s
+ ext v16.16b, v16.16b, v16.16b, #8
+ eor v4.16b, v4.16b, v7.16b
+ ext v17.16b, v17.16b, v17.16b, #4
+ ext v7.16b, v7.16b, v7.16b, #12
+ ushr v21.4s, v4.4s, #7
shl v4.4s, v4.4s, #25
- orr v1.16b, v4.16b, v1.16b
- ushr v4.4s, v5.4s, #7
+ orr v4.16b, v4.16b, v21.16b
+ ext v21.16b, v18.16b, v18.16b, #12
+ add v17.4s, v17.4s, v4.4s
+ ext v18.16b, v18.16b, v21.16b, #12
+ mov v21.16b, v20.16b
+ eor v16.16b, v17.16b, v16.16b
+ rev64 v18.4s, v18.4s
+ mov v21.s[1], v6.s[2]
+ tbl v16.16b, { v16.16b }, v0.16b
+ add v7.4s, v7.4s, v16.4s
+ eor v4.16b, v4.16b, v7.16b
+ ushr v22.4s, v4.4s, #12
+ shl v23.4s, v4.4s, #20
+ trn2 v4.4s, v18.4s, v21.4s
+ orr v18.16b, v23.16b, v22.16b
+ add v21.4s, v18.4s, v4.4s
+ add v17.4s, v21.4s, v17.4s
+ zip1 v21.2d, v20.2d, v5.2d
+ zip2 v5.4s, v5.4s, v20.4s
+ eor v16.16b, v16.16b, v17.16b
+ mov v21.s[3], v6.s[3]
+ ext v17.16b, v17.16b, v17.16b, #12
+ zip1 v20.4s, v5.4s, v6.4s
+ tbl v16.16b, { v16.16b }, v2.16b
+ zip1 v5.4s, v6.4s, v5.4s
+ add v22.4s, v7.4s, v16.4s
+ ext v16.16b, v16.16b, v16.16b, #8
+ ext v20.16b, v5.16b, v20.16b, #8
+ eor v7.16b, v18.16b, v22.16b
+ ext v18.16b, v21.16b, v21.16b, #12
+ ushr v23.4s, v7.4s, #7
+ shl v24.4s, v7.4s, #25
+ uzp1 v7.4s, v21.4s, v18.4s
+ orr v18.16b, v24.16b, v23.16b
+ add v21.4s, v18.4s, v7.4s
+ add v17.4s, v21.4s, v17.4s
+ ext v21.16b, v22.16b, v22.16b, #4
+ eor v16.16b, v17.16b, v16.16b
+ tbl v16.16b, { v16.16b }, v0.16b
+ add v21.4s, v21.4s, v16.4s
+ eor v18.16b, v18.16b, v21.16b
+ ushr v6.4s, v18.4s, #12
+ shl v18.4s, v18.4s, #20
+ orr v5.16b, v18.16b, v6.16b
+ add v6.4s, v5.4s, v20.4s
+ add v6.4s, v6.4s, v17.4s
+ ext v17.16b, v19.16b, v19.16b, #4
+ eor v16.16b, v16.16b, v6.16b
+ uzp1 v18.4s, v17.4s, v17.4s
+ tbl v16.16b, { v16.16b }, v2.16b
+ ext v18.16b, v18.16b, v17.16b, #8
+ add v19.4s, v21.4s, v16.4s
+ uzp2 v18.4s, v18.4s, v4.4s
+ ext v16.16b, v16.16b, v16.16b, #8
+ eor v5.16b, v5.16b, v19.16b
+ add v6.4s, v6.4s, v18.4s
+ ext v19.16b, v19.16b, v19.16b, #12
+ ushr v21.4s, v5.4s, #7
shl v5.4s, v5.4s, #25
- orr v4.16b, v5.16b, v4.16b
- ushr v5.4s, v2.4s, #7
- shl v2.4s, v2.4s, #25
- orr v2.16b, v2.16b, v5.16b
- ushr v5.4s, v0.4s, #7
- shl v0.4s, v0.4s, #25
- orr v0.16b, v0.16b, v5.16b
- eor v10.16b, v0.16b, v20.16b
- eor v11.16b, v1.16b, v21.16b
- eor v19.16b, v4.16b, v16.16b
- cmp x0, x22
- eor v16.16b, v2.16b, v3.16b
- mov w6, w19
- b.ne .LBB2_4
-.LBB2_7:
- zip1 v0.4s, v7.4s, v18.4s
- zip2 v1.4s, v7.4s, v18.4s
- zip1 v2.4s, v6.4s, v17.4s
- zip2 v3.4s, v6.4s, v17.4s
- zip1 v4.4s, v10.4s, v11.4s
- zip2 v5.4s, v10.4s, v11.4s
- zip1 v6.4s, v19.4s, v16.4s
- zip2 v7.4s, v19.4s, v16.4s
- add x15, x20, #4
- tst w5, #0x1
- sub x28, x28, #4
- zip1 v16.2d, v0.2d, v2.2d
- zip2 v0.2d, v0.2d, v2.2d
- zip1 v2.2d, v1.2d, v3.2d
- zip2 v1.2d, v1.2d, v3.2d
- zip1 v3.2d, v4.2d, v6.2d
- zip2 v4.2d, v4.2d, v6.2d
- zip1 v6.2d, v5.2d, v7.2d
- zip2 v5.2d, v5.2d, v7.2d
- add x24, x24, #32
- csel x20, x15, x20, ne
- cmp x28, #3
- stp q16, q3, [x26]
- stp q0, q4, [x26, #32]
- stp q2, q6, [x26, #64]
- stp q1, q5, [x26, #96]
- add x26, x26, #128
- b.hi .LBB2_2
-.LBB2_8:
- cbz x28, .LBB2_16
- orr w8, w7, w19
- and x21, x5, #0x1
- stur w8, [x29, #-64]
-.LBB2_10:
- ldr x8, [sp, #40]
- ldr x25, [x24]
- ldur w4, [x29, #-64]
- ldp q1, q0, [x8]
- mov x8, x22
- stp q1, q0, [x29, #-48]
-.LBB2_11:
- subs x23, x8, #1
- b.eq .LBB2_13
- cbnz x8, .LBB2_14
- b .LBB2_15
-.LBB2_13:
- orr w4, w4, w27
-.LBB2_14:
- sub x0, x29, #48
- mov w2, #64
- mov x1, x25
- mov x3, x20
- bl zfs_blake3_compress_in_place_sse41
- add x25, x25, #64
- mov x8, x23
- mov w4, w19
- b .LBB2_11
-.LBB2_15:
- ldp q0, q1, [x29, #-48]
- add x20, x20, x21
- add x24, x24, #8
- subs x28, x28, #1
- stp q0, q1, [x26], #32
- b.ne .LBB2_10
-.LBB2_16:
- add sp, sp, #448
- ldp x20, x19, [sp, #144]
- ldp x22, x21, [sp, #128]
- ldp x24, x23, [sp, #112]
- ldp x26, x25, [sp, #96]
- ldp x28, x27, [sp, #80]
- ldp x29, x30, [sp, #64]
+ ext v6.16b, v6.16b, v6.16b, #4
+ orr v5.16b, v5.16b, v21.16b
+ ext v21.16b, v17.16b, v17.16b, #12
+ add v6.4s, v6.4s, v5.4s
+ ext v17.16b, v17.16b, v21.16b, #12
+ mov v21.16b, v20.16b
+ eor v16.16b, v6.16b, v16.16b
+ rev64 v17.4s, v17.4s
+ mov v21.s[1], v7.s[2]
+ tbl v16.16b, { v16.16b }, v0.16b
+ add v19.4s, v19.4s, v16.4s
+ eor v5.16b, v5.16b, v19.16b
+ ushr v22.4s, v5.4s, #12
+ shl v23.4s, v5.4s, #20
+ trn2 v5.4s, v17.4s, v21.4s
+ orr v17.16b, v23.16b, v22.16b
+ add v21.4s, v17.4s, v5.4s
+ add v6.4s, v21.4s, v6.4s
+ eor v16.16b, v16.16b, v6.16b
+ ext v6.16b, v6.16b, v6.16b, #12
+ tbl v21.16b, { v16.16b }, v2.16b
+ zip1 v16.2d, v20.2d, v4.2d
+ zip2 v4.4s, v4.4s, v20.4s
+ add v19.4s, v19.4s, v21.4s
+ mov v16.s[3], v7.s[3]
+ ext v21.16b, v21.16b, v21.16b, #8
+ zip1 v20.4s, v4.4s, v7.4s
+ eor v17.16b, v17.16b, v19.16b
+ ext v22.16b, v16.16b, v16.16b, #12
+ ext v19.16b, v19.16b, v19.16b, #4
+ zip1 v4.4s, v7.4s, v4.4s
+ ushr v23.4s, v17.4s, #7
+ shl v17.4s, v17.4s, #25
+ uzp1 v16.4s, v16.4s, v22.4s
+ ext v4.16b, v4.16b, v20.16b, #8
+ orr v17.16b, v17.16b, v23.16b
+ add v22.4s, v17.4s, v16.4s
+ add v6.4s, v22.4s, v6.4s
+ eor v21.16b, v6.16b, v21.16b
+ tbl v21.16b, { v21.16b }, v0.16b
+ add v19.4s, v19.4s, v21.4s
+ eor v17.16b, v17.16b, v19.16b
+ ushr v7.4s, v17.4s, #12
+ shl v17.4s, v17.4s, #20
+ orr v7.16b, v17.16b, v7.16b
+ add v17.4s, v7.4s, v4.4s
+ add v6.4s, v17.4s, v6.4s
+ ext v17.16b, v18.16b, v18.16b, #4
+ eor v18.16b, v21.16b, v6.16b
+ uzp1 v20.4s, v17.4s, v17.4s
+ tbl v18.16b, { v18.16b }, v2.16b
+ ext v20.16b, v20.16b, v17.16b, #8
+ add v19.4s, v19.4s, v18.4s
+ uzp2 v20.4s, v20.4s, v5.4s
+ ext v18.16b, v18.16b, v18.16b, #8
+ eor v7.16b, v7.16b, v19.16b
+ add v6.4s, v6.4s, v20.4s
+ ushr v21.4s, v7.4s, #7
+ shl v7.4s, v7.4s, #25
+ ext v6.16b, v6.16b, v6.16b, #4
+ orr v7.16b, v7.16b, v21.16b
+ add v21.4s, v6.4s, v7.4s
+ eor v6.16b, v21.16b, v18.16b
+ ext v18.16b, v19.16b, v19.16b, #12
+ tbl v19.16b, { v6.16b }, v0.16b
+ ext v6.16b, v17.16b, v17.16b, #12
+ add v18.4s, v18.4s, v19.4s
+ ext v6.16b, v17.16b, v6.16b, #12
+ mov v17.16b, v4.16b
+ eor v7.16b, v7.16b, v18.16b
+ rev64 v6.4s, v6.4s
+ mov v17.s[1], v16.s[2]
+ ushr v22.4s, v7.4s, #12
+ shl v7.4s, v7.4s, #20
+ trn2 v6.4s, v6.4s, v17.4s
+ orr v7.16b, v7.16b, v22.16b
+ add v17.4s, v7.4s, v6.4s
+ add v17.4s, v17.4s, v21.4s
+ zip1 v21.2d, v4.2d, v5.2d
+ zip2 v4.4s, v5.4s, v4.4s
+ eor v19.16b, v19.16b, v17.16b
+ mov v21.s[3], v16.s[3]
+ ext v17.16b, v17.16b, v17.16b, #12
+ tbl v19.16b, { v19.16b }, v2.16b
+ ext v22.16b, v21.16b, v21.16b, #12
+ add v18.4s, v18.4s, v19.4s
+ ext v19.16b, v19.16b, v19.16b, #8
+ eor v7.16b, v7.16b, v18.16b
+ ext v18.16b, v18.16b, v18.16b, #4
+ ushr v23.4s, v7.4s, #7
+ shl v24.4s, v7.4s, #25
+ uzp1 v7.4s, v21.4s, v22.4s
+ orr v21.16b, v24.16b, v23.16b
+ add v22.4s, v21.4s, v7.4s
+ add v17.4s, v22.4s, v17.4s
+ eor v19.16b, v17.16b, v19.16b
+ tbl v19.16b, { v19.16b }, v0.16b
+ add v18.4s, v18.4s, v19.4s
+ eor v5.16b, v21.16b, v18.16b
+ zip1 v21.4s, v4.4s, v16.4s
+ zip1 v4.4s, v16.4s, v4.4s
+ ushr v16.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ ext v21.16b, v4.16b, v21.16b, #8
+ orr v4.16b, v5.16b, v16.16b
+ ext v16.16b, v20.16b, v20.16b, #4
+ mov v23.16b, v21.16b
+ add v5.4s, v4.4s, v21.4s
+ mov v23.s[1], v7.s[2]
+ add v5.4s, v5.4s, v17.4s
+ eor v17.16b, v19.16b, v5.16b
+ uzp1 v19.4s, v16.4s, v16.4s
+ tbl v17.16b, { v17.16b }, v2.16b
+ ext v19.16b, v19.16b, v16.16b, #8
+ add v18.4s, v18.4s, v17.4s
+ uzp2 v19.4s, v19.4s, v6.4s
+ eor v4.16b, v4.16b, v18.16b
+ add v5.4s, v5.4s, v19.4s
+ ext v19.16b, v19.16b, v19.16b, #4
+ ushr v20.4s, v4.4s, #7
+ shl v4.4s, v4.4s, #25
+ ext v5.16b, v5.16b, v5.16b, #4
+ orr v20.16b, v4.16b, v20.16b
+ ext v4.16b, v17.16b, v17.16b, #8
+ add v17.4s, v5.4s, v20.4s
+ ext v5.16b, v18.16b, v18.16b, #12
+ eor v4.16b, v17.16b, v4.16b
+ tbl v18.16b, { v4.16b }, v0.16b
+ ext v4.16b, v16.16b, v16.16b, #12
+ add v22.4s, v5.4s, v18.4s
+ ext v4.16b, v16.16b, v4.16b, #12
+ eor v5.16b, v20.16b, v22.16b
+ rev64 v16.4s, v4.4s
+ ushr v20.4s, v5.4s, #12
+ shl v24.4s, v5.4s, #20
+ trn2 v5.4s, v16.4s, v23.4s
+ orr v16.16b, v24.16b, v20.16b
+ add v20.4s, v16.4s, v5.4s
+ add v17.4s, v20.4s, v17.4s
+ zip1 v20.2d, v21.2d, v6.2d
+ zip2 v6.4s, v6.4s, v21.4s
+ eor v18.16b, v18.16b, v17.16b
+ mov v20.s[3], v7.s[3]
+ ext v17.16b, v17.16b, v17.16b, #12
+ zip1 v21.4s, v6.4s, v7.4s
+ tbl v18.16b, { v18.16b }, v2.16b
+ ext v24.16b, v20.16b, v20.16b, #12
+ zip1 v6.4s, v7.4s, v6.4s
+ add v22.4s, v22.4s, v18.4s
+ ext v18.16b, v18.16b, v18.16b, #8
+ ext v6.16b, v6.16b, v21.16b, #8
+ eor v16.16b, v16.16b, v22.16b
+ ext v22.16b, v22.16b, v22.16b, #4
+ zip1 v5.2d, v6.2d, v5.2d
+ zip2 v4.4s, v4.4s, v6.4s
+ ushr v25.4s, v16.4s, #7
+ shl v26.4s, v16.4s, #25
+ uzp1 v16.4s, v20.4s, v24.4s
+ orr v20.16b, v26.16b, v25.16b
+ mov v5.s[3], v16.s[3]
+ add v24.4s, v20.4s, v16.4s
+ add v17.4s, v24.4s, v17.4s
+ eor v18.16b, v17.16b, v18.16b
+ tbl v18.16b, { v18.16b }, v0.16b
+ add v22.4s, v22.4s, v18.4s
+ eor v20.16b, v20.16b, v22.16b
+ ushr v7.4s, v20.4s, #12
+ shl v20.4s, v20.4s, #20
+ orr v7.16b, v20.16b, v7.16b
+ add v20.4s, v7.4s, v6.4s
+ add v17.4s, v20.4s, v17.4s
+ ext v20.16b, v19.16b, v19.16b, #8
+ eor v18.16b, v18.16b, v17.16b
+ ext v17.16b, v17.16b, v17.16b, #4
+ tbl v18.16b, { v18.16b }, v2.16b
+ add v21.4s, v22.4s, v18.4s
+ uzp2 v22.4s, v20.4s, v23.4s
+ ext v18.16b, v18.16b, v18.16b, #8
+ eor v7.16b, v7.16b, v21.16b
+ ext v20.16b, v22.16b, v20.16b, #4
+ ushr v22.4s, v7.4s, #7
+ shl v7.4s, v7.4s, #25
+ add v17.4s, v17.4s, v20.4s
+ ext v20.16b, v21.16b, v21.16b, #12
+ ext v21.16b, v19.16b, v19.16b, #12
+ orr v7.16b, v7.16b, v22.16b
+ ext v19.16b, v19.16b, v21.16b, #12
+ add v17.4s, v17.4s, v7.4s
+ mov v21.16b, v6.16b
+ rev64 v19.4s, v19.4s
+ eor v18.16b, v17.16b, v18.16b
+ mov v21.s[1], v16.s[2]
+ tbl v18.16b, { v18.16b }, v0.16b
+ trn2 v19.4s, v19.4s, v21.4s
+ add v20.4s, v20.4s, v18.4s
+ eor v7.16b, v7.16b, v20.16b
+ ushr v22.4s, v7.4s, #12
+ shl v7.4s, v7.4s, #20
+ orr v7.16b, v7.16b, v22.16b
+ add v19.4s, v7.4s, v19.4s
+ add v17.4s, v19.4s, v17.4s
+ eor v18.16b, v18.16b, v17.16b
+ ext v17.16b, v17.16b, v17.16b, #12
+ tbl v18.16b, { v18.16b }, v2.16b
+ add v19.4s, v20.4s, v18.4s
+ ext v20.16b, v5.16b, v5.16b, #12
+ ext v18.16b, v18.16b, v18.16b, #8
+ eor v7.16b, v7.16b, v19.16b
+ uzp1 v5.4s, v5.4s, v20.4s
+ ushr v21.4s, v7.4s, #7
+ shl v7.4s, v7.4s, #25
+ orr v7.16b, v7.16b, v21.16b
+ add v5.4s, v7.4s, v5.4s
+ add v5.4s, v5.4s, v17.4s
+ eor v17.16b, v5.16b, v18.16b
+ ext v18.16b, v19.16b, v19.16b, #4
+ tbl v17.16b, { v17.16b }, v0.16b
+ add v18.4s, v18.4s, v17.4s
+ eor v6.16b, v7.16b, v18.16b
+ zip1 v7.4s, v4.4s, v16.4s
+ zip1 v4.4s, v16.4s, v4.4s
+ ushr v16.4s, v6.4s, #12
+ shl v6.4s, v6.4s, #20
+ ext v4.16b, v4.16b, v7.16b, #8
+ orr v6.16b, v6.16b, v16.16b
+ add v4.4s, v6.4s, v4.4s
+ add v4.4s, v4.4s, v5.4s
+ eor v5.16b, v17.16b, v4.16b
+ ext v4.16b, v4.16b, v4.16b, #4
+ tbl v5.16b, { v5.16b }, v2.16b
+ add v7.4s, v18.4s, v5.4s
+ eor v6.16b, v6.16b, v7.16b
+ ext v7.16b, v7.16b, v7.16b, #12
+ ushr v16.4s, v6.4s, #7
+ shl v6.4s, v6.4s, #25
+ orr v6.16b, v6.16b, v16.16b
+ ext v16.16b, v5.16b, v5.16b, #8
+ eor v5.16b, v4.16b, v7.16b
+ eor v4.16b, v6.16b, v16.16b
+.LBB3_11:
+ subs x13, x15, #1
+ b.eq .LBB3_9
+ cbnz x15, .LBB3_10
+ add x4, x4, x12
+ add x0, x0, #8
+ subs x1, x1, #1
+ stp q5, q4, [x8], #32
+ b.ne .LBB3_8
+.LBB3_14:
+ add sp, sp, #368
+ ldp x20, x19, [sp, #128]
+ ldp x22, x21, [sp, #112]
+ ldp x24, x23, [sp, #96]
+ ldp x26, x25, [sp, #80]
+ ldp x29, x27, [sp, #64]
ldp d9, d8, [sp, #48]
ldp d11, d10, [sp, #32]
ldp d13, d12, [sp, #16]
- ldp d15, d14, [sp], #160
+ ldp d15, d14, [sp], #144
ret
-.Lfunc_end2:
- .size zfs_blake3_hash_many_sse41, .Lfunc_end2-zfs_blake3_hash_many_sse41
+.Lfunc_end3:
+ .size zfs_blake3_hash_many_sse41, .Lfunc_end3-zfs_blake3_hash_many_sse41
.cfi_endproc
.section ".note.GNU-stack","",@progbits
-#endif
+#endif
\ No newline at end of file