git.proxmox.com Git - mirror_zfs.git/commitdiff
Fix BLAKE3 aarch64 assembly for FreeBSD and macOS
author: Tino Reichardt <milky-zfs@mcmilk.de>
Wed, 26 Apr 2023 19:40:26 +0000 (21:40 +0200)
committer: GitHub <noreply@github.com>
Wed, 26 Apr 2023 19:40:26 +0000 (12:40 -0700)
The x18 register isn't usable within FreeBSD kernel space, so we
have to fix the BLAKE3 aarch64 assembly so that it does not use it.

The source files are here: https://github.com/mcmilk/BLAKE3-tests

Reviewed-by: Kyle Evans <kevans@FreeBSD.org>
Signed-off-by: Tino Reichardt <milky-zfs@mcmilk.de>
Closes #14728

module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S

index 8237f0eb5a4e7a86fffe0ca6fea03a2917de7326..dc2719d142db607dbfb09c504bc0e7c171cf4ed2 100644 (file)
 /*
  * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
  * Copyright (c) 2019-2022 Samuel Neves and Matthew Krupcale
- * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ * Copyright (c) 2022-2023 Tino Reichardt <milky-zfs@mcmilk.de>
  *
  * This is converted assembly: SSE2 -> ARMv8-A
  * Used tools: SIMDe https://github.com/simd-everywhere/simde
+ *
+ * Should work on FreeBSD, Linux and macOS
+ * see: https://github.com/mcmilk/BLAKE3-tests/blob/master/contrib/simde.sh
  */
 
 #if defined(__aarch64__)
        .text
-       .section        .rodata.cst16,"aM",@progbits,16
-       .p2align        4
-.LCPI0_0:
-       .word   1779033703
-       .word   3144134277
-       .word   1013904242
-       .word   2773480762
-.LCPI0_1:
-       .xword  0
-       .xword  -4294967296
-.LCPI0_2:
-       .xword  -1
-       .xword  4294967295
+       .section        .note.gnu.property,"a",@note
+       .p2align        3
+       .word   4
+       .word   16
+       .word   5
+       .asciz  "GNU"
+       .word   3221225472
+       .word   4
+       .word   3
+       .word   0
+.Lsec_end0:
        .text
        .globl  zfs_blake3_compress_in_place_sse2
        .p2align        2
        .type   zfs_blake3_compress_in_place_sse2,@function
 zfs_blake3_compress_in_place_sse2:
        .cfi_startproc
-       ldp     q3, q2, [x0]
-       ldp     q5, q6, [x1]
-       add     x10, x1, #32
-       lsr     x11, x3, #32
-       fmov    s4, w3
-       ld2     { v17.4s, v18.4s }, [x10]
-       adrp    x10, .LCPI0_2
-       and     w8, w2, #0xff
-       mov     v4.s[1], w11
-       ldr     q1, [x10, :lo12:.LCPI0_2]
-       and     w9, w4, #0xff
-       adrp    x12, .LCPI0_0
-       mov     v4.s[2], w8
-       uzp1    v19.4s, v5.4s, v6.4s
-       add     v3.4s, v2.4s, v3.4s
-       ldr     q7, [x12, :lo12:.LCPI0_0]
-       mov     v4.s[3], w9
-       add     v3.4s, v3.4s, v19.4s
-       uzp2    v5.4s, v5.4s, v6.4s
-       ext     v21.16b, v18.16b, v18.16b, #12
-       uzp1    v6.4s, v19.4s, v19.4s
-       ext     v22.16b, v19.16b, v19.16b, #12
-       eor     v4.16b, v3.16b, v4.16b
-       ext     v20.16b, v17.16b, v17.16b, #12
-       ext     v6.16b, v6.16b, v19.16b, #8
-       ext     v19.16b, v19.16b, v22.16b, #12
-       zip1    v22.2d, v21.2d, v5.2d
-       rev32   v24.8h, v4.8h
-       mov     v4.16b, v1.16b
-       zip2    v23.4s, v5.4s, v21.4s
-       uzp2    v6.4s, v6.4s, v5.4s
-       bsl     v4.16b, v22.16b, v20.16b
-       add     v3.4s, v3.4s, v5.4s
-       zip1    v5.4s, v23.4s, v20.4s
-       zip1    v22.4s, v20.4s, v23.4s
-       add     v23.4s, v24.4s, v7.4s
-       ext     v7.16b, v6.16b, v6.16b, #4
-       ext     v25.16b, v4.16b, v4.16b, #12
-       ext     v5.16b, v22.16b, v5.16b, #8
-       eor     v2.16b, v23.16b, v2.16b
-       uzp1    v4.4s, v4.4s, v25.4s
-       uzp1    v22.4s, v7.4s, v7.4s
-       ext     v25.16b, v7.16b, v7.16b, #12
-       ext     v22.16b, v22.16b, v7.16b, #8
-       ext     v7.16b, v7.16b, v25.16b, #12
-       ushr    v25.4s, v2.4s, #12
-       shl     v2.4s, v2.4s, #20
-       orr     v2.16b, v2.16b, v25.16b
-       add     v3.4s, v3.4s, v2.4s
-       eor     v24.16b, v3.16b, v24.16b
-       add     v3.4s, v3.4s, v17.4s
-       ushr    v17.4s, v24.4s, #8
-       shl     v18.4s, v24.4s, #24
-       orr     v17.16b, v18.16b, v17.16b
-       add     v18.4s, v17.4s, v23.4s
-       eor     v2.16b, v18.16b, v2.16b
-       ushr    v23.4s, v2.4s, #7
-       shl     v2.4s, v2.4s, #25
-       ext     v3.16b, v3.16b, v3.16b, #12
-       orr     v2.16b, v2.16b, v23.16b
-       ext     v17.16b, v17.16b, v17.16b, #8
-       add     v3.4s, v2.4s, v3.4s
-       adrp    x11, .LCPI0_1
-       eor     v17.16b, v3.16b, v17.16b
-       ldr     q16, [x11, :lo12:.LCPI0_1]
-       ext     v18.16b, v18.16b, v18.16b, #4
-       rev32   v24.8h, v17.8h
-       movi    v0.2d, #0xffffffff00000000
-       add     v23.4s, v3.4s, v21.4s
-       mov     v21.s[1], v20.s[2]
-       add     v20.4s, v18.4s, v24.4s
-       bit     v19.16b, v21.16b, v0.16b
-       eor     v3.16b, v20.16b, v2.16b
-       uzp2    v2.4s, v22.4s, v19.4s
-       zip1    v17.2d, v5.2d, v19.2d
-       zip2    v18.4s, v19.4s, v5.4s
-       ushr    v21.4s, v3.4s, #12
-       shl     v3.4s, v3.4s, #20
-       ext     v22.16b, v2.16b, v2.16b, #4
-       bsl     v16.16b, v4.16b, v17.16b
-       zip1    v17.4s, v18.4s, v4.4s
-       zip1    v18.4s, v4.4s, v18.4s
-       orr     v21.16b, v3.16b, v21.16b
-       ext     v25.16b, v16.16b, v16.16b, #12
-       ext     v3.16b, v18.16b, v17.16b, #8
-       uzp1    v18.4s, v22.4s, v22.4s
-       ext     v26.16b, v22.16b, v22.16b, #12
-       add     v23.4s, v23.4s, v21.4s
-       uzp1    v17.4s, v16.4s, v25.4s
-       ext     v16.16b, v18.16b, v22.16b, #8
-       ext     v18.16b, v22.16b, v26.16b, #12
-       eor     v22.16b, v23.16b, v24.16b
-       add     v6.4s, v23.4s, v6.4s
-       ushr    v23.4s, v22.4s, #8
-       shl     v22.4s, v22.4s, #24
-       orr     v22.16b, v22.16b, v23.16b
-       add     v20.4s, v22.4s, v20.4s
-       eor     v21.16b, v20.16b, v21.16b
-       ushr    v23.4s, v21.4s, #7
-       shl     v21.4s, v21.4s, #25
-       ext     v6.16b, v6.16b, v6.16b, #4
-       orr     v21.16b, v21.16b, v23.16b
-       ext     v22.16b, v22.16b, v22.16b, #8
-       add     v6.4s, v21.4s, v6.4s
-       eor     v22.16b, v6.16b, v22.16b
-       ext     v20.16b, v20.16b, v20.16b, #12
-       add     v6.4s, v6.4s, v19.4s
-       rev32   v19.8h, v22.8h
-       add     v20.4s, v20.4s, v19.4s
-       eor     v21.16b, v20.16b, v21.16b
-       ushr    v22.4s, v21.4s, #12
-       shl     v21.4s, v21.4s, #20
-       orr     v21.16b, v21.16b, v22.16b
-       add     v6.4s, v6.4s, v21.4s
-       eor     v19.16b, v6.16b, v19.16b
-       ushr    v22.4s, v19.4s, #8
-       shl     v19.4s, v19.4s, #24
-       orr     v19.16b, v19.16b, v22.16b
-       add     v20.4s, v19.4s, v20.4s
-       eor     v21.16b, v20.16b, v21.16b
-       ext     v6.16b, v6.16b, v6.16b, #12
-       ushr    v22.4s, v21.4s, #7
-       shl     v21.4s, v21.4s, #25
-       add     v6.4s, v6.4s, v4.4s
-       orr     v21.16b, v21.16b, v22.16b
-       ext     v19.16b, v19.16b, v19.16b, #8
-       add     v6.4s, v6.4s, v21.4s
-       eor     v19.16b, v6.16b, v19.16b
-       ext     v20.16b, v20.16b, v20.16b, #4
-       rev32   v19.8h, v19.8h
-       add     v20.4s, v20.4s, v19.4s
-       add     v6.4s, v6.4s, v5.4s
-       mov     v5.s[1], v4.s[2]
-       eor     v4.16b, v20.16b, v21.16b
-       ushr    v21.4s, v4.4s, #12
-       shl     v4.4s, v4.4s, #20
-       orr     v21.16b, v4.16b, v21.16b
-       add     v6.4s, v6.4s, v21.4s
-       eor     v19.16b, v6.16b, v19.16b
-       add     v2.4s, v6.4s, v2.4s
-       ushr    v6.4s, v19.4s, #8
-       shl     v19.4s, v19.4s, #24
-       orr     v6.16b, v19.16b, v6.16b
-       add     v19.4s, v6.4s, v20.4s
-       eor     v20.16b, v19.16b, v21.16b
-       ushr    v21.4s, v20.4s, #7
-       shl     v20.4s, v20.4s, #25
-       ext     v2.16b, v2.16b, v2.16b, #4
-       orr     v20.16b, v20.16b, v21.16b
-       ext     v6.16b, v6.16b, v6.16b, #8
-       add     v2.4s, v20.4s, v2.4s
-       eor     v6.16b, v2.16b, v6.16b
-       ext     v19.16b, v19.16b, v19.16b, #12
-       rev32   v6.8h, v6.8h
-       add     v19.4s, v19.4s, v6.4s
-       mov     v22.16b, v0.16b
-       eor     v20.16b, v19.16b, v20.16b
-       bsl     v22.16b, v5.16b, v7.16b
-       ushr    v21.4s, v20.4s, #12
-       shl     v20.4s, v20.4s, #20
-       add     v2.4s, v2.4s, v22.4s
-       orr     v20.16b, v20.16b, v21.16b
-       add     v2.4s, v2.4s, v20.4s
-       eor     v6.16b, v2.16b, v6.16b
-       ushr    v21.4s, v6.4s, #8
-       shl     v6.4s, v6.4s, #24
-       orr     v6.16b, v6.16b, v21.16b
-       add     v19.4s, v6.4s, v19.4s
-       eor     v20.16b, v19.16b, v20.16b
-       ext     v2.16b, v2.16b, v2.16b, #12
-       ushr    v21.4s, v20.4s, #7
-       shl     v20.4s, v20.4s, #25
-       add     v2.4s, v2.4s, v17.4s
-       orr     v20.16b, v20.16b, v21.16b
-       ext     v6.16b, v6.16b, v6.16b, #8
-       add     v2.4s, v2.4s, v20.4s
-       eor     v6.16b, v2.16b, v6.16b
-       uzp2    v5.4s, v16.4s, v22.4s
-       zip1    v7.2d, v3.2d, v22.2d
-       zip2    v16.4s, v22.4s, v3.4s
-       ext     v19.16b, v19.16b, v19.16b, #4
-       rev32   v22.8h, v6.8h
-       ext     v23.16b, v5.16b, v5.16b, #4
-       bif     v7.16b, v17.16b, v1.16b
-       zip1    v24.4s, v16.4s, v17.4s
-       zip1    v16.4s, v17.4s, v16.4s
-       add     v21.4s, v2.4s, v3.4s
-       mov     v3.s[1], v17.s[2]
-       add     v17.4s, v19.4s, v22.4s
-       mov     v19.16b, v0.16b
-       ext     v25.16b, v7.16b, v7.16b, #12
-       ext     v4.16b, v16.16b, v24.16b, #8
-       uzp1    v16.4s, v23.4s, v23.4s
-       bsl     v19.16b, v3.16b, v18.16b
-       eor     v2.16b, v17.16b, v20.16b
-       uzp1    v7.4s, v7.4s, v25.4s
-       ext     v25.16b, v16.16b, v23.16b, #8
-       zip1    v3.2d, v4.2d, v19.2d
-       ushr    v20.4s, v2.4s, #12
-       shl     v2.4s, v2.4s, #20
-       ext     v24.16b, v23.16b, v23.16b, #12
-       uzp2    v6.4s, v25.4s, v19.4s
-       zip2    v18.4s, v19.4s, v4.4s
-       bif     v3.16b, v7.16b, v1.16b
-       orr     v20.16b, v2.16b, v20.16b
-       ext     v16.16b, v23.16b, v24.16b, #12
-       ext     v23.16b, v6.16b, v6.16b, #4
-       zip1    v24.4s, v18.4s, v7.4s
-       zip1    v18.4s, v7.4s, v18.4s
-       ext     v25.16b, v3.16b, v3.16b, #12
-       add     v21.4s, v21.4s, v20.4s
-       ext     v2.16b, v18.16b, v24.16b, #8
-       uzp1    v18.4s, v23.4s, v23.4s
-       ext     v24.16b, v23.16b, v23.16b, #12
-       uzp1    v3.4s, v3.4s, v25.4s
-       eor     v22.16b, v21.16b, v22.16b
-       ext     v25.16b, v18.16b, v23.16b, #8
-       dup     v18.4s, v2.s[3]
-       ext     v23.16b, v23.16b, v24.16b, #12
-       add     v5.4s, v21.4s, v5.4s
-       trn1    v21.4s, v3.4s, v3.4s
-       ushr    v24.4s, v22.4s, #8
-       shl     v22.4s, v22.4s, #24
-       ext     v18.16b, v21.16b, v18.16b, #8
-       orr     v21.16b, v22.16b, v24.16b
-       add     v17.4s, v21.4s, v17.4s
-       eor     v20.16b, v17.16b, v20.16b
-       ushr    v22.4s, v20.4s, #7
-       shl     v20.4s, v20.4s, #25
-       ext     v5.16b, v5.16b, v5.16b, #4
-       orr     v20.16b, v20.16b, v22.16b
-       ext     v21.16b, v21.16b, v21.16b, #8
-       add     v5.4s, v20.4s, v5.4s
-       eor     v21.16b, v5.16b, v21.16b
-       ext     v17.16b, v17.16b, v17.16b, #12
-       add     v5.4s, v5.4s, v19.4s
-       rev32   v19.8h, v21.8h
-       add     v17.4s, v17.4s, v19.4s
-       eor     v20.16b, v17.16b, v20.16b
-       ushr    v21.4s, v20.4s, #12
-       shl     v20.4s, v20.4s, #20
-       orr     v20.16b, v20.16b, v21.16b
-       add     v5.4s, v5.4s, v20.4s
-       eor     v19.16b, v5.16b, v19.16b
-       ushr    v21.4s, v19.4s, #8
-       shl     v19.4s, v19.4s, #24
-       orr     v19.16b, v19.16b, v21.16b
-       add     v17.4s, v19.4s, v17.4s
-       eor     v20.16b, v17.16b, v20.16b
-       ext     v5.16b, v5.16b, v5.16b, #12
-       ushr    v21.4s, v20.4s, #7
-       shl     v20.4s, v20.4s, #25
-       add     v5.4s, v5.4s, v7.4s
-       orr     v20.16b, v20.16b, v21.16b
-       ext     v19.16b, v19.16b, v19.16b, #8
-       add     v5.4s, v5.4s, v20.4s
-       eor     v19.16b, v5.16b, v19.16b
-       ext     v17.16b, v17.16b, v17.16b, #4
-       rev32   v22.8h, v19.8h
-       add     v21.4s, v5.4s, v4.4s
-       mov     v4.s[1], v7.s[2]
-       add     v19.4s, v17.4s, v22.4s
-       bit     v16.16b, v4.16b, v0.16b
-       eor     v5.16b, v19.16b, v20.16b
-       uzp2    v4.4s, v25.4s, v16.4s
-       zip1    v7.2d, v2.2d, v16.2d
-       zip2    v17.4s, v16.4s, v2.4s
-       ushr    v20.4s, v5.4s, #12
-       shl     v5.4s, v5.4s, #20
-       ext     v24.16b, v4.16b, v4.16b, #4
-       bif     v7.16b, v3.16b, v1.16b
-       zip1    v25.4s, v17.4s, v3.4s
-       zip1    v17.4s, v3.4s, v17.4s
-       orr     v20.16b, v5.16b, v20.16b
-       ext     v26.16b, v7.16b, v7.16b, #12
-       ext     v5.16b, v17.16b, v25.16b, #8
-       uzp1    v17.4s, v24.4s, v24.4s
-       ext     v25.16b, v24.16b, v24.16b, #12
-       bit     v23.16b, v18.16b, v0.16b
-       add     v21.4s, v21.4s, v20.4s
-       uzp1    v7.4s, v7.4s, v26.4s
-       ext     v26.16b, v17.16b, v24.16b, #8
-       ext     v17.16b, v24.16b, v25.16b, #12
-       eor     v22.16b, v21.16b, v22.16b
-       add     v6.4s, v21.4s, v6.4s
-       zip1    v21.2d, v5.2d, v23.2d
-       zip2    v24.4s, v23.4s, v5.4s
-       bif     v21.16b, v7.16b, v1.16b
-       zip1    v1.4s, v24.4s, v7.4s
-       zip1    v24.4s, v7.4s, v24.4s
-       ext     v1.16b, v24.16b, v1.16b, #8
-       ushr    v24.4s, v22.4s, #8
-       shl     v22.4s, v22.4s, #24
-       orr     v22.16b, v22.16b, v24.16b
-       add     v19.4s, v22.4s, v19.4s
-       ext     v24.16b, v21.16b, v21.16b, #12
-       eor     v20.16b, v19.16b, v20.16b
-       uzp1    v21.4s, v21.4s, v24.4s
-       ushr    v24.4s, v20.4s, #7
-       shl     v20.4s, v20.4s, #25
-       orr     v20.16b, v20.16b, v24.16b
-       ext     v6.16b, v6.16b, v6.16b, #4
-       ext     v22.16b, v22.16b, v22.16b, #8
-       add     v6.4s, v20.4s, v6.4s
-       eor     v22.16b, v6.16b, v22.16b
-       ext     v19.16b, v19.16b, v19.16b, #12
-       add     v6.4s, v6.4s, v16.4s
-       rev32   v16.8h, v22.8h
-       add     v19.4s, v19.4s, v16.4s
-       eor     v20.16b, v19.16b, v20.16b
-       ushr    v22.4s, v20.4s, #12
-       shl     v20.4s, v20.4s, #20
-       orr     v20.16b, v20.16b, v22.16b
-       add     v6.4s, v6.4s, v20.4s
-       eor     v16.16b, v6.16b, v16.16b
-       ext     v6.16b, v6.16b, v6.16b, #12
-       add     v3.4s, v6.4s, v3.4s
-       ushr    v6.4s, v16.4s, #8
-       shl     v16.4s, v16.4s, #24
-       orr     v6.16b, v16.16b, v6.16b
-       add     v16.4s, v6.4s, v19.4s
-       eor     v19.16b, v16.16b, v20.16b
-       ushr    v20.4s, v19.4s, #7
-       shl     v19.4s, v19.4s, #25
-       orr     v19.16b, v19.16b, v20.16b
-       ext     v6.16b, v6.16b, v6.16b, #8
-       add     v3.4s, v3.4s, v19.4s
-       eor     v6.16b, v3.16b, v6.16b
-       ext     v16.16b, v16.16b, v16.16b, #4
-       add     v2.4s, v3.4s, v2.4s
-       rev32   v3.8h, v6.8h
-       add     v6.4s, v16.4s, v3.4s
-       eor     v16.16b, v6.16b, v19.16b
-       ushr    v19.4s, v16.4s, #12
-       shl     v16.4s, v16.4s, #20
-       orr     v16.16b, v16.16b, v19.16b
-       add     v2.4s, v2.4s, v16.4s
-       eor     v3.16b, v2.16b, v3.16b
-       add     v2.4s, v2.4s, v4.4s
-       ushr    v4.4s, v3.4s, #8
-       shl     v3.4s, v3.4s, #24
-       orr     v3.16b, v3.16b, v4.16b
-       add     v4.4s, v3.4s, v6.4s
-       eor     v6.16b, v4.16b, v16.16b
-       ushr    v16.4s, v6.4s, #7
-       shl     v6.4s, v6.4s, #25
-       ext     v2.16b, v2.16b, v2.16b, #4
-       orr     v6.16b, v6.16b, v16.16b
-       ext     v3.16b, v3.16b, v3.16b, #8
-       add     v2.4s, v6.4s, v2.4s
-       eor     v3.16b, v2.16b, v3.16b
-       ext     v4.16b, v4.16b, v4.16b, #12
-       rev32   v3.8h, v3.8h
-       add     v4.4s, v4.4s, v3.4s
-       eor     v6.16b, v4.16b, v6.16b
-       ushr    v16.4s, v6.4s, #12
-       shl     v6.4s, v6.4s, #20
-       add     v2.4s, v2.4s, v23.4s
-       orr     v6.16b, v6.16b, v16.16b
-       add     v2.4s, v2.4s, v6.4s
-       eor     v3.16b, v2.16b, v3.16b
-       ushr    v16.4s, v3.4s, #8
-       shl     v3.4s, v3.4s, #24
-       orr     v3.16b, v3.16b, v16.16b
-       add     v4.4s, v3.4s, v4.4s
-       eor     v6.16b, v4.16b, v6.16b
-       ext     v2.16b, v2.16b, v2.16b, #12
-       ushr    v16.4s, v6.4s, #7
-       shl     v6.4s, v6.4s, #25
-       add     v2.4s, v2.4s, v7.4s
-       orr     v6.16b, v6.16b, v16.16b
-       ext     v3.16b, v3.16b, v3.16b, #8
-       add     v2.4s, v2.4s, v6.4s
-       eor     v3.16b, v2.16b, v3.16b
-       ext     v4.16b, v4.16b, v4.16b, #4
-       rev32   v3.8h, v3.8h
-       add     v2.4s, v2.4s, v5.4s
-       mov     v5.s[1], v7.s[2]
-       add     v4.4s, v4.4s, v3.4s
-       bsl     v0.16b, v5.16b, v17.16b
-       eor     v5.16b, v4.16b, v6.16b
-       ushr    v6.4s, v5.4s, #12
-       shl     v5.4s, v5.4s, #20
-       orr     v5.16b, v5.16b, v6.16b
-       add     v2.4s, v2.4s, v5.4s
-       eor     v3.16b, v2.16b, v3.16b
-       ushr    v6.4s, v3.4s, #8
-       shl     v3.4s, v3.4s, #24
-       orr     v3.16b, v3.16b, v6.16b
-       add     v4.4s, v3.4s, v4.4s
-       uzp2    v18.4s, v26.4s, v18.4s
-       eor     v5.16b, v4.16b, v5.16b
-       add     v2.4s, v2.4s, v18.4s
-       ushr    v6.4s, v5.4s, #7
-       shl     v5.4s, v5.4s, #25
-       ext     v2.16b, v2.16b, v2.16b, #4
-       orr     v5.16b, v5.16b, v6.16b
-       ext     v3.16b, v3.16b, v3.16b, #8
-       add     v2.4s, v5.4s, v2.4s
-       eor     v3.16b, v2.16b, v3.16b
-       ext     v4.16b, v4.16b, v4.16b, #12
-       add     v0.4s, v2.4s, v0.4s
-       rev32   v2.8h, v3.8h
-       add     v3.4s, v4.4s, v2.4s
-       eor     v4.16b, v3.16b, v5.16b
-       ushr    v5.4s, v4.4s, #12
-       shl     v4.4s, v4.4s, #20
-       orr     v4.16b, v4.16b, v5.16b
-       add     v0.4s, v0.4s, v4.4s
-       eor     v2.16b, v0.16b, v2.16b
-       ushr    v5.4s, v2.4s, #8
-       shl     v2.4s, v2.4s, #24
-       orr     v2.16b, v2.16b, v5.16b
-       add     v3.4s, v2.4s, v3.4s
-       eor     v4.16b, v3.16b, v4.16b
-       ext     v0.16b, v0.16b, v0.16b, #12
-       ushr    v5.4s, v4.4s, #7
-       shl     v4.4s, v4.4s, #25
-       add     v0.4s, v0.4s, v21.4s
-       orr     v4.16b, v4.16b, v5.16b
-       ext     v2.16b, v2.16b, v2.16b, #8
-       add     v0.4s, v0.4s, v4.4s
-       eor     v2.16b, v0.16b, v2.16b
-       ext     v3.16b, v3.16b, v3.16b, #4
-       add     v0.4s, v0.4s, v1.4s
-       rev32   v1.8h, v2.8h
-       add     v2.4s, v3.4s, v1.4s
-       eor     v3.16b, v2.16b, v4.16b
-       ushr    v4.4s, v3.4s, #12
-       shl     v3.4s, v3.4s, #20
-       orr     v3.16b, v3.16b, v4.16b
-       add     v0.4s, v0.4s, v3.4s
-       eor     v1.16b, v0.16b, v1.16b
-       ushr    v4.4s, v1.4s, #8
-       shl     v1.4s, v1.4s, #24
-       orr     v1.16b, v1.16b, v4.16b
-       add     v2.4s, v1.4s, v2.4s
-       eor     v3.16b, v2.16b, v3.16b
-       ext     v0.16b, v0.16b, v0.16b, #4
-       ext     v2.16b, v2.16b, v2.16b, #12
-       ushr    v4.4s, v3.4s, #7
-       shl     v3.4s, v3.4s, #25
-       ext     v1.16b, v1.16b, v1.16b, #8
+       hint    #25
+       .cfi_negate_ra_state
+       sub     sp, sp, #96
+       stp     x29, x30, [sp, #64]
+       add     x29, sp, #64
+       str     x19, [sp, #80]
+       .cfi_def_cfa w29, 32
+       .cfi_offset w19, -16
+       .cfi_offset w30, -24
+       .cfi_offset w29, -32
+       mov     x19, x0
+       mov     w5, w4
+       mov     x4, x3
+       mov     w3, w2
+       mov     x2, x1
+       mov     x0, sp
+       mov     x1, x19
+       bl      compress_pre
+       ldp     q0, q1, [sp]
+       ldp     q2, q3, [sp, #32]
        eor     v0.16b, v2.16b, v0.16b
-       orr     v2.16b, v3.16b, v4.16b
-       eor     v1.16b, v2.16b, v1.16b
-       stp     q0, q1, [x0]
+       eor     v1.16b, v3.16b, v1.16b
+       ldp     x29, x30, [sp, #64]
+       stp     q0, q1, [x19]
+       ldr     x19, [sp, #80]
+       add     sp, sp, #96
+       hint    #29
        ret
 .Lfunc_end0:
        .size   zfs_blake3_compress_in_place_sse2, .Lfunc_end0-zfs_blake3_compress_in_place_sse2
@@ -504,483 +85,518 @@ zfs_blake3_compress_in_place_sse2:
        .section        .rodata.cst16,"aM",@progbits,16
        .p2align        4
 .LCPI1_0:
-       .word   1779033703
-       .word   3144134277
-       .word   1013904242
-       .word   2773480762
-.LCPI1_1:
-       .xword  0
-       .xword  -4294967296
-.LCPI1_2:
-       .xword  -1
-       .xword  4294967295
+       .xword  -4942790177982912921
+       .xword  -6534734903820487822
        .text
-       .globl  zfs_blake3_compress_xof_sse2
        .p2align        2
-       .type   zfs_blake3_compress_xof_sse2,@function
-zfs_blake3_compress_xof_sse2:
+       .type   compress_pre,@function
+compress_pre:
        .cfi_startproc
-       ldp     q3, q2, [x0]
-       ldp     q5, q6, [x1]
-       add     x10, x1, #32
-       lsr     x11, x3, #32
-       fmov    s4, w3
-       ld2     { v17.4s, v18.4s }, [x10]
-       adrp    x10, .LCPI1_2
-       and     w8, w2, #0xff
-       mov     v4.s[1], w11
-       ldr     q1, [x10, :lo12:.LCPI1_2]
-       and     w9, w4, #0xff
-       adrp    x12, .LCPI1_0
-       mov     v4.s[2], w8
-       uzp1    v19.4s, v5.4s, v6.4s
-       add     v3.4s, v2.4s, v3.4s
-       ldr     q7, [x12, :lo12:.LCPI1_0]
-       mov     v4.s[3], w9
-       add     v3.4s, v3.4s, v19.4s
-       uzp2    v5.4s, v5.4s, v6.4s
-       ext     v21.16b, v18.16b, v18.16b, #12
-       uzp1    v6.4s, v19.4s, v19.4s
-       ext     v22.16b, v19.16b, v19.16b, #12
-       eor     v4.16b, v3.16b, v4.16b
-       ext     v20.16b, v17.16b, v17.16b, #12
-       ext     v6.16b, v6.16b, v19.16b, #8
-       ext     v19.16b, v19.16b, v22.16b, #12
-       zip1    v22.2d, v21.2d, v5.2d
-       rev32   v24.8h, v4.8h
-       mov     v4.16b, v1.16b
-       zip2    v23.4s, v5.4s, v21.4s
-       uzp2    v6.4s, v6.4s, v5.4s
-       bsl     v4.16b, v22.16b, v20.16b
-       add     v3.4s, v3.4s, v5.4s
-       zip1    v5.4s, v23.4s, v20.4s
-       zip1    v22.4s, v20.4s, v23.4s
-       add     v23.4s, v24.4s, v7.4s
-       ext     v7.16b, v6.16b, v6.16b, #4
-       ext     v25.16b, v4.16b, v4.16b, #12
-       ext     v5.16b, v22.16b, v5.16b, #8
-       eor     v2.16b, v23.16b, v2.16b
-       uzp1    v4.4s, v4.4s, v25.4s
-       uzp1    v22.4s, v7.4s, v7.4s
-       ext     v25.16b, v7.16b, v7.16b, #12
-       ext     v22.16b, v22.16b, v7.16b, #8
-       ext     v7.16b, v7.16b, v25.16b, #12
-       ushr    v25.4s, v2.4s, #12
-       shl     v2.4s, v2.4s, #20
-       orr     v2.16b, v2.16b, v25.16b
-       add     v3.4s, v3.4s, v2.4s
-       eor     v24.16b, v3.16b, v24.16b
-       add     v3.4s, v3.4s, v17.4s
-       ushr    v17.4s, v24.4s, #8
-       shl     v18.4s, v24.4s, #24
-       orr     v17.16b, v18.16b, v17.16b
-       add     v18.4s, v17.4s, v23.4s
-       eor     v2.16b, v18.16b, v2.16b
-       ushr    v23.4s, v2.4s, #7
-       shl     v2.4s, v2.4s, #25
-       ext     v3.16b, v3.16b, v3.16b, #12
-       orr     v2.16b, v2.16b, v23.16b
-       ext     v17.16b, v17.16b, v17.16b, #8
-       add     v3.4s, v2.4s, v3.4s
-       adrp    x11, .LCPI1_1
-       eor     v17.16b, v3.16b, v17.16b
-       ldr     q16, [x11, :lo12:.LCPI1_1]
-       ext     v18.16b, v18.16b, v18.16b, #4
-       rev32   v24.8h, v17.8h
-       movi    v0.2d, #0xffffffff00000000
-       add     v23.4s, v3.4s, v21.4s
-       mov     v21.s[1], v20.s[2]
-       add     v20.4s, v18.4s, v24.4s
-       bit     v19.16b, v21.16b, v0.16b
-       eor     v3.16b, v20.16b, v2.16b
-       uzp2    v2.4s, v22.4s, v19.4s
-       zip1    v17.2d, v5.2d, v19.2d
-       zip2    v18.4s, v19.4s, v5.4s
-       ushr    v21.4s, v3.4s, #12
-       shl     v3.4s, v3.4s, #20
-       ext     v22.16b, v2.16b, v2.16b, #4
-       bsl     v16.16b, v4.16b, v17.16b
-       zip1    v17.4s, v18.4s, v4.4s
-       zip1    v18.4s, v4.4s, v18.4s
-       orr     v21.16b, v3.16b, v21.16b
-       ext     v25.16b, v16.16b, v16.16b, #12
-       ext     v3.16b, v18.16b, v17.16b, #8
-       uzp1    v18.4s, v22.4s, v22.4s
-       ext     v26.16b, v22.16b, v22.16b, #12
-       add     v23.4s, v23.4s, v21.4s
-       uzp1    v17.4s, v16.4s, v25.4s
-       ext     v16.16b, v18.16b, v22.16b, #8
-       ext     v18.16b, v22.16b, v26.16b, #12
-       eor     v22.16b, v23.16b, v24.16b
-       add     v6.4s, v23.4s, v6.4s
-       ushr    v23.4s, v22.4s, #8
-       shl     v22.4s, v22.4s, #24
-       orr     v22.16b, v22.16b, v23.16b
-       add     v20.4s, v22.4s, v20.4s
-       eor     v21.16b, v20.16b, v21.16b
-       ushr    v23.4s, v21.4s, #7
-       shl     v21.4s, v21.4s, #25
-       ext     v6.16b, v6.16b, v6.16b, #4
-       orr     v21.16b, v21.16b, v23.16b
-       ext     v22.16b, v22.16b, v22.16b, #8
-       add     v6.4s, v21.4s, v6.4s
-       eor     v22.16b, v6.16b, v22.16b
-       ext     v20.16b, v20.16b, v20.16b, #12
-       add     v6.4s, v6.4s, v19.4s
-       rev32   v19.8h, v22.8h
-       add     v20.4s, v20.4s, v19.4s
-       eor     v21.16b, v20.16b, v21.16b
-       ushr    v22.4s, v21.4s, #12
-       shl     v21.4s, v21.4s, #20
-       orr     v21.16b, v21.16b, v22.16b
-       add     v6.4s, v6.4s, v21.4s
-       eor     v19.16b, v6.16b, v19.16b
-       ushr    v22.4s, v19.4s, #8
-       shl     v19.4s, v19.4s, #24
-       orr     v19.16b, v19.16b, v22.16b
-       add     v20.4s, v19.4s, v20.4s
-       eor     v21.16b, v20.16b, v21.16b
-       ext     v6.16b, v6.16b, v6.16b, #12
-       ushr    v22.4s, v21.4s, #7
-       shl     v21.4s, v21.4s, #25
-       add     v6.4s, v6.4s, v4.4s
-       orr     v21.16b, v21.16b, v22.16b
-       ext     v19.16b, v19.16b, v19.16b, #8
-       add     v6.4s, v6.4s, v21.4s
-       eor     v19.16b, v6.16b, v19.16b
-       ext     v20.16b, v20.16b, v20.16b, #4
-       rev32   v19.8h, v19.8h
-       add     v20.4s, v20.4s, v19.4s
-       add     v6.4s, v6.4s, v5.4s
-       mov     v5.s[1], v4.s[2]
-       eor     v4.16b, v20.16b, v21.16b
-       ushr    v21.4s, v4.4s, #12
-       shl     v4.4s, v4.4s, #20
-       orr     v21.16b, v4.16b, v21.16b
-       add     v6.4s, v6.4s, v21.4s
-       eor     v19.16b, v6.16b, v19.16b
-       add     v2.4s, v6.4s, v2.4s
-       ushr    v6.4s, v19.4s, #8
-       shl     v19.4s, v19.4s, #24
-       orr     v6.16b, v19.16b, v6.16b
-       add     v19.4s, v6.4s, v20.4s
-       eor     v20.16b, v19.16b, v21.16b
-       ushr    v21.4s, v20.4s, #7
-       shl     v20.4s, v20.4s, #25
-       ext     v2.16b, v2.16b, v2.16b, #4
-       orr     v20.16b, v20.16b, v21.16b
-       ext     v6.16b, v6.16b, v6.16b, #8
-       add     v2.4s, v20.4s, v2.4s
-       eor     v6.16b, v2.16b, v6.16b
-       ext     v19.16b, v19.16b, v19.16b, #12
-       rev32   v6.8h, v6.8h
-       add     v19.4s, v19.4s, v6.4s
-       mov     v22.16b, v0.16b
-       eor     v20.16b, v19.16b, v20.16b
-       bsl     v22.16b, v5.16b, v7.16b
-       ushr    v21.4s, v20.4s, #12
-       shl     v20.4s, v20.4s, #20
-       add     v2.4s, v2.4s, v22.4s
-       orr     v20.16b, v20.16b, v21.16b
-       add     v2.4s, v2.4s, v20.4s
-       eor     v6.16b, v2.16b, v6.16b
-       ushr    v21.4s, v6.4s, #8
-       shl     v6.4s, v6.4s, #24
-       orr     v6.16b, v6.16b, v21.16b
-       add     v19.4s, v6.4s, v19.4s
-       eor     v20.16b, v19.16b, v20.16b
-       ext     v2.16b, v2.16b, v2.16b, #12
-       ushr    v21.4s, v20.4s, #7
-       shl     v20.4s, v20.4s, #25
-       add     v2.4s, v2.4s, v17.4s
-       orr     v20.16b, v20.16b, v21.16b
-       ext     v6.16b, v6.16b, v6.16b, #8
-       add     v2.4s, v2.4s, v20.4s
-       eor     v6.16b, v2.16b, v6.16b
-       uzp2    v5.4s, v16.4s, v22.4s
-       zip1    v7.2d, v3.2d, v22.2d
-       zip2    v16.4s, v22.4s, v3.4s
-       ext     v19.16b, v19.16b, v19.16b, #4
-       rev32   v22.8h, v6.8h
-       ext     v23.16b, v5.16b, v5.16b, #4
-       bif     v7.16b, v17.16b, v1.16b
-       zip1    v24.4s, v16.4s, v17.4s
-       zip1    v16.4s, v17.4s, v16.4s
-       add     v21.4s, v2.4s, v3.4s
-       mov     v3.s[1], v17.s[2]
-       add     v17.4s, v19.4s, v22.4s
-       mov     v19.16b, v0.16b
-       ext     v25.16b, v7.16b, v7.16b, #12
-       ext     v4.16b, v16.16b, v24.16b, #8
-       uzp1    v16.4s, v23.4s, v23.4s
-       bsl     v19.16b, v3.16b, v18.16b
-       eor     v2.16b, v17.16b, v20.16b
-       uzp1    v7.4s, v7.4s, v25.4s
-       ext     v25.16b, v16.16b, v23.16b, #8
-       zip1    v3.2d, v4.2d, v19.2d
-       ushr    v20.4s, v2.4s, #12
-       shl     v2.4s, v2.4s, #20
-       ext     v24.16b, v23.16b, v23.16b, #12
-       uzp2    v6.4s, v25.4s, v19.4s
-       zip2    v18.4s, v19.4s, v4.4s
-       bif     v3.16b, v7.16b, v1.16b
-       orr     v20.16b, v2.16b, v20.16b
-       ext     v16.16b, v23.16b, v24.16b, #12
-       ext     v23.16b, v6.16b, v6.16b, #4
-       zip1    v24.4s, v18.4s, v7.4s
-       zip1    v18.4s, v7.4s, v18.4s
-       ext     v25.16b, v3.16b, v3.16b, #12
-       add     v21.4s, v21.4s, v20.4s
-       ext     v2.16b, v18.16b, v24.16b, #8
-       uzp1    v18.4s, v23.4s, v23.4s
-       ext     v24.16b, v23.16b, v23.16b, #12
-       uzp1    v3.4s, v3.4s, v25.4s
-       eor     v22.16b, v21.16b, v22.16b
-       ext     v25.16b, v18.16b, v23.16b, #8
-       dup     v18.4s, v2.s[3]
-       ext     v23.16b, v23.16b, v24.16b, #12
-       add     v5.4s, v21.4s, v5.4s
-       trn1    v21.4s, v3.4s, v3.4s
-       ushr    v24.4s, v22.4s, #8
-       shl     v22.4s, v22.4s, #24
-       ext     v18.16b, v21.16b, v18.16b, #8
-       orr     v21.16b, v22.16b, v24.16b
-       add     v17.4s, v21.4s, v17.4s
-       eor     v20.16b, v17.16b, v20.16b
-       ushr    v22.4s, v20.4s, #7
-       shl     v20.4s, v20.4s, #25
-       ext     v5.16b, v5.16b, v5.16b, #4
-       orr     v20.16b, v20.16b, v22.16b
-       ext     v21.16b, v21.16b, v21.16b, #8
-       add     v5.4s, v20.4s, v5.4s
-       eor     v21.16b, v5.16b, v21.16b
-       ext     v17.16b, v17.16b, v17.16b, #12
-       add     v5.4s, v5.4s, v19.4s
-       rev32   v19.8h, v21.8h
-       add     v17.4s, v17.4s, v19.4s
-       eor     v20.16b, v17.16b, v20.16b
-       ushr    v21.4s, v20.4s, #12
-       shl     v20.4s, v20.4s, #20
-       orr     v20.16b, v20.16b, v21.16b
-       add     v5.4s, v5.4s, v20.4s
-       eor     v19.16b, v5.16b, v19.16b
-       ushr    v21.4s, v19.4s, #8
-       shl     v19.4s, v19.4s, #24
-       orr     v19.16b, v19.16b, v21.16b
-       add     v17.4s, v19.4s, v17.4s
-       eor     v20.16b, v17.16b, v20.16b
-       ext     v5.16b, v5.16b, v5.16b, #12
-       ushr    v21.4s, v20.4s, #7
-       shl     v20.4s, v20.4s, #25
-       add     v5.4s, v5.4s, v7.4s
-       orr     v20.16b, v20.16b, v21.16b
-       ext     v19.16b, v19.16b, v19.16b, #8
-       add     v5.4s, v5.4s, v20.4s
-       eor     v19.16b, v5.16b, v19.16b
-       ext     v17.16b, v17.16b, v17.16b, #4
-       rev32   v22.8h, v19.8h
-       add     v21.4s, v5.4s, v4.4s
-       mov     v4.s[1], v7.s[2]
-       add     v19.4s, v17.4s, v22.4s
-       bit     v16.16b, v4.16b, v0.16b
-       eor     v5.16b, v19.16b, v20.16b
-       uzp2    v4.4s, v25.4s, v16.4s
-       zip1    v7.2d, v2.2d, v16.2d
-       zip2    v17.4s, v16.4s, v2.4s
-       ushr    v20.4s, v5.4s, #12
-       shl     v5.4s, v5.4s, #20
-       ext     v24.16b, v4.16b, v4.16b, #4
-       bif     v7.16b, v3.16b, v1.16b
-       zip1    v25.4s, v17.4s, v3.4s
-       zip1    v17.4s, v3.4s, v17.4s
-       orr     v20.16b, v5.16b, v20.16b
-       ext     v26.16b, v7.16b, v7.16b, #12
-       ext     v5.16b, v17.16b, v25.16b, #8
-       uzp1    v17.4s, v24.4s, v24.4s
-       ext     v25.16b, v24.16b, v24.16b, #12
-       bit     v23.16b, v18.16b, v0.16b
-       add     v21.4s, v21.4s, v20.4s
-       uzp1    v7.4s, v7.4s, v26.4s
-       ext     v26.16b, v17.16b, v24.16b, #8
-       ext     v17.16b, v24.16b, v25.16b, #12
-       eor     v22.16b, v21.16b, v22.16b
-       add     v6.4s, v21.4s, v6.4s
-       zip1    v21.2d, v5.2d, v23.2d
-       zip2    v24.4s, v23.4s, v5.4s
-       bif     v21.16b, v7.16b, v1.16b
-       zip1    v1.4s, v24.4s, v7.4s
-       zip1    v24.4s, v7.4s, v24.4s
-       ext     v1.16b, v24.16b, v1.16b, #8
-       ushr    v24.4s, v22.4s, #8
-       shl     v22.4s, v22.4s, #24
-       orr     v22.16b, v22.16b, v24.16b
-       add     v19.4s, v22.4s, v19.4s
-       ext     v24.16b, v21.16b, v21.16b, #12
-       eor     v20.16b, v19.16b, v20.16b
-       uzp1    v21.4s, v21.4s, v24.4s
-       ushr    v24.4s, v20.4s, #7
-       shl     v20.4s, v20.4s, #25
-       orr     v20.16b, v20.16b, v24.16b
-       ext     v6.16b, v6.16b, v6.16b, #4
-       ext     v22.16b, v22.16b, v22.16b, #8
-       add     v6.4s, v20.4s, v6.4s
-       eor     v22.16b, v6.16b, v22.16b
-       ext     v19.16b, v19.16b, v19.16b, #12
-       add     v6.4s, v6.4s, v16.4s
-       rev32   v16.8h, v22.8h
-       add     v19.4s, v19.4s, v16.4s
-       eor     v20.16b, v19.16b, v20.16b
-       ushr    v22.4s, v20.4s, #12
-       shl     v20.4s, v20.4s, #20
-       orr     v20.16b, v20.16b, v22.16b
-       add     v6.4s, v6.4s, v20.4s
-       eor     v16.16b, v6.16b, v16.16b
-       ext     v6.16b, v6.16b, v6.16b, #12
-       add     v3.4s, v6.4s, v3.4s
-       ushr    v6.4s, v16.4s, #8
-       shl     v16.4s, v16.4s, #24
-       orr     v6.16b, v16.16b, v6.16b
-       add     v16.4s, v6.4s, v19.4s
-       eor     v19.16b, v16.16b, v20.16b
-       ushr    v20.4s, v19.4s, #7
-       shl     v19.4s, v19.4s, #25
-       orr     v19.16b, v19.16b, v20.16b
-       ext     v6.16b, v6.16b, v6.16b, #8
-       add     v3.4s, v3.4s, v19.4s
-       eor     v6.16b, v3.16b, v6.16b
-       ext     v16.16b, v16.16b, v16.16b, #4
-       add     v2.4s, v3.4s, v2.4s
-       rev32   v3.8h, v6.8h
-       add     v6.4s, v16.4s, v3.4s
-       eor     v16.16b, v6.16b, v19.16b
-       ushr    v19.4s, v16.4s, #12
-       shl     v16.4s, v16.4s, #20
-       orr     v16.16b, v16.16b, v19.16b
-       add     v2.4s, v2.4s, v16.4s
-       eor     v3.16b, v2.16b, v3.16b
-       add     v2.4s, v2.4s, v4.4s
-       ushr    v4.4s, v3.4s, #8
-       shl     v3.4s, v3.4s, #24
-       orr     v3.16b, v3.16b, v4.16b
-       add     v4.4s, v3.4s, v6.4s
-       eor     v6.16b, v4.16b, v16.16b
-       ushr    v16.4s, v6.4s, #7
-       shl     v6.4s, v6.4s, #25
-       ext     v2.16b, v2.16b, v2.16b, #4
-       orr     v6.16b, v6.16b, v16.16b
-       ext     v3.16b, v3.16b, v3.16b, #8
-       add     v2.4s, v6.4s, v2.4s
+       hint    #34
+       fmov    s1, w3
+       movi    d0, #0x0000ff000000ff
+       ldr     q2, [x1]
+       fmov    d3, x4
+       adrp    x8, .LCPI1_0
+       mov     v1.s[1], w5
+       str     q2, [x0]
+       ldr     q4, [x8, :lo12:.LCPI1_0]
+       add     x8, x2, #32
+       ldr     q5, [x1, #16]
+       and     v0.8b, v1.8b, v0.8b
+       stp     q5, q4, [x0, #16]
+       mov     v3.d[1], v0.d[0]
+       str     q3, [x0, #48]
+       ldp     q0, q6, [x2]
+       uzp1    v1.4s, v0.4s, v6.4s
+       uzp2    v0.4s, v0.4s, v6.4s
+       add     v2.4s, v2.4s, v1.4s
+       uzp1    v18.4s, v1.4s, v1.4s
+       add     v2.4s, v2.4s, v5.4s
        eor     v3.16b, v2.16b, v3.16b
-       ext     v4.16b, v4.16b, v4.16b, #12
+       add     v2.4s, v2.4s, v0.4s
        rev32   v3.8h, v3.8h
-       add     v4.4s, v4.4s, v3.4s
-       eor     v6.16b, v4.16b, v6.16b
-       ushr    v16.4s, v6.4s, #12
-       shl     v6.4s, v6.4s, #20
-       add     v2.4s, v2.4s, v23.4s
-       orr     v6.16b, v6.16b, v16.16b
-       add     v2.4s, v2.4s, v6.4s
+       add     v4.4s, v3.4s, v4.4s
+       eor     v5.16b, v4.16b, v5.16b
+       ushr    v6.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       orr     v5.16b, v5.16b, v6.16b
+       add     v2.4s, v2.4s, v5.4s
        eor     v3.16b, v2.16b, v3.16b
-       ushr    v16.4s, v3.4s, #8
+       ushr    v6.4s, v3.4s, #8
        shl     v3.4s, v3.4s, #24
-       orr     v3.16b, v3.16b, v16.16b
+       orr     v3.16b, v3.16b, v6.16b
+       ld2     { v6.4s, v7.4s }, [x8]
        add     v4.4s, v3.4s, v4.4s
-       eor     v6.16b, v4.16b, v6.16b
-       ext     v2.16b, v2.16b, v2.16b, #12
-       ushr    v16.4s, v6.4s, #7
-       shl     v6.4s, v6.4s, #25
-       add     v2.4s, v2.4s, v7.4s
-       orr     v6.16b, v6.16b, v16.16b
        ext     v3.16b, v3.16b, v3.16b, #8
        add     v2.4s, v2.4s, v6.4s
-       eor     v3.16b, v2.16b, v3.16b
+       eor     v5.16b, v4.16b, v5.16b
        ext     v4.16b, v4.16b, v4.16b, #4
-       rev32   v3.8h, v3.8h
+       ext     v6.16b, v6.16b, v6.16b, #12
+       ext     v2.16b, v2.16b, v2.16b, #12
+       ushr    v16.4s, v5.4s, #7
+       shl     v5.4s, v5.4s, #25
+       orr     v5.16b, v5.16b, v16.16b
+       ext     v16.16b, v7.16b, v7.16b, #12
        add     v2.4s, v2.4s, v5.4s
-       mov     v5.s[1], v7.s[2]
+       mov     v7.16b, v16.16b
+       eor     v3.16b, v3.16b, v2.16b
+       add     v2.4s, v2.4s, v16.4s
+       mov     v7.s[1], v6.s[2]
+       rev32   v3.8h, v3.8h
        add     v4.4s, v4.4s, v3.4s
-       bsl     v0.16b, v5.16b, v17.16b
-       eor     v5.16b, v4.16b, v6.16b
-       ushr    v6.4s, v5.4s, #12
+       eor     v5.16b, v4.16b, v5.16b
+       ushr    v17.4s, v5.4s, #12
        shl     v5.4s, v5.4s, #20
-       orr     v5.16b, v5.16b, v6.16b
+       orr     v5.16b, v5.16b, v17.16b
        add     v2.4s, v2.4s, v5.4s
        eor     v3.16b, v2.16b, v3.16b
-       ushr    v6.4s, v3.4s, #8
+       ushr    v17.4s, v3.4s, #8
        shl     v3.4s, v3.4s, #24
-       orr     v3.16b, v3.16b, v6.16b
+       orr     v3.16b, v3.16b, v17.16b
+       ext     v17.16b, v18.16b, v1.16b, #8
        add     v4.4s, v3.4s, v4.4s
-       uzp2    v18.4s, v26.4s, v18.4s
+       uzp2    v17.4s, v17.4s, v0.4s
+       ext     v3.16b, v3.16b, v3.16b, #8
        eor     v5.16b, v4.16b, v5.16b
-       add     v2.4s, v2.4s, v18.4s
-       ushr    v6.4s, v5.4s, #7
+       add     v2.4s, v2.4s, v17.4s
+       ext     v4.16b, v4.16b, v4.16b, #12
+       ushr    v18.4s, v5.4s, #7
        shl     v5.4s, v5.4s, #25
        ext     v2.16b, v2.16b, v2.16b, #4
-       orr     v5.16b, v5.16b, v6.16b
+       orr     v5.16b, v5.16b, v18.16b
+       ext     v18.16b, v1.16b, v1.16b, #12
+       add     v2.4s, v2.4s, v5.4s
+       ext     v1.16b, v1.16b, v18.16b, #12
+       zip1    v18.2d, v16.2d, v0.2d
+       zip2    v0.4s, v0.4s, v16.4s
+       eor     v3.16b, v3.16b, v2.16b
+       rev64   v1.4s, v1.4s
+       mov     v18.s[3], v6.s[3]
+       zip1    v16.4s, v0.4s, v6.4s
+       rev32   v3.8h, v3.8h
+       trn2    v1.4s, v1.4s, v7.4s
+       zip1    v0.4s, v6.4s, v0.4s
+       add     v4.4s, v4.4s, v3.4s
+       add     v2.4s, v2.4s, v1.4s
+       ext     v6.16b, v0.16b, v16.16b, #8
+       eor     v5.16b, v4.16b, v5.16b
+       ushr    v7.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       orr     v5.16b, v5.16b, v7.16b
+       add     v7.4s, v2.4s, v5.4s
+       eor     v2.16b, v7.16b, v3.16b
+       ext     v7.16b, v7.16b, v7.16b, #12
+       ushr    v3.4s, v2.4s, #8
+       shl     v2.4s, v2.4s, #24
+       orr     v3.16b, v2.16b, v3.16b
+       ext     v2.16b, v18.16b, v18.16b, #12
+       add     v4.4s, v3.4s, v4.4s
+       uzp1    v2.4s, v18.4s, v2.4s
        ext     v3.16b, v3.16b, v3.16b, #8
-       add     v2.4s, v5.4s, v2.4s
-       eor     v3.16b, v2.16b, v3.16b
+       eor     v5.16b, v4.16b, v5.16b
+       add     v7.4s, v7.4s, v2.4s
+       ext     v4.16b, v4.16b, v4.16b, #4
+       ushr    v18.4s, v5.4s, #7
+       shl     v5.4s, v5.4s, #25
+       orr     v5.16b, v5.16b, v18.16b
+       add     v7.4s, v7.4s, v5.4s
+       eor     v3.16b, v3.16b, v7.16b
+       add     v7.4s, v7.4s, v6.4s
+       rev32   v3.8h, v3.8h
+       add     v4.4s, v4.4s, v3.4s
+       eor     v5.16b, v4.16b, v5.16b
+       ushr    v0.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       orr     v0.16b, v5.16b, v0.16b
+       add     v5.4s, v7.4s, v0.4s
+       ext     v7.16b, v17.16b, v17.16b, #4
+       eor     v3.16b, v5.16b, v3.16b
+       uzp1    v17.4s, v7.4s, v7.4s
+       ushr    v16.4s, v3.4s, #8
+       shl     v3.4s, v3.4s, #24
+       orr     v3.16b, v3.16b, v16.16b
+       ext     v16.16b, v17.16b, v7.16b, #8
+       add     v4.4s, v3.4s, v4.4s
+       uzp2    v16.4s, v16.4s, v1.4s
+       ext     v3.16b, v3.16b, v3.16b, #8
+       eor     v0.16b, v4.16b, v0.16b
+       add     v5.4s, v5.4s, v16.4s
+       ext     v4.16b, v4.16b, v4.16b, #12
+       ushr    v17.4s, v0.4s, #7
+       shl     v0.4s, v0.4s, #25
+       ext     v5.16b, v5.16b, v5.16b, #4
+       orr     v0.16b, v0.16b, v17.16b
+       ext     v17.16b, v7.16b, v7.16b, #12
+       add     v5.4s, v5.4s, v0.4s
+       ext     v7.16b, v7.16b, v17.16b, #12
+       mov     v17.16b, v6.16b
+       eor     v3.16b, v3.16b, v5.16b
+       rev64   v7.4s, v7.4s
+       mov     v17.s[1], v2.s[2]
+       rev32   v3.8h, v3.8h
+       add     v4.4s, v4.4s, v3.4s
+       eor     v18.16b, v4.16b, v0.16b
+       trn2    v0.4s, v7.4s, v17.4s
+       ushr    v7.4s, v18.4s, #12
+       shl     v17.4s, v18.4s, #20
+       add     v5.4s, v5.4s, v0.4s
+       zip1    v18.2d, v6.2d, v1.2d
+       zip2    v1.4s, v1.4s, v6.4s
+       orr     v7.16b, v17.16b, v7.16b
+       mov     v18.s[3], v2.s[3]
+       zip1    v6.4s, v1.4s, v2.4s
+       add     v5.4s, v5.4s, v7.4s
+       zip1    v1.4s, v2.4s, v1.4s
+       eor     v3.16b, v5.16b, v3.16b
+       ext     v5.16b, v5.16b, v5.16b, #12
+       ext     v6.16b, v1.16b, v6.16b, #8
+       ushr    v17.4s, v3.4s, #8
+       shl     v3.4s, v3.4s, #24
+       orr     v17.16b, v3.16b, v17.16b
+       ext     v3.16b, v18.16b, v18.16b, #12
+       add     v4.4s, v17.4s, v4.4s
+       uzp1    v3.4s, v18.4s, v3.4s
+       ext     v17.16b, v17.16b, v17.16b, #8
+       eor     v7.16b, v4.16b, v7.16b
+       add     v5.4s, v5.4s, v3.4s
+       ext     v4.16b, v4.16b, v4.16b, #4
+       ushr    v18.4s, v7.4s, #7
+       shl     v7.4s, v7.4s, #25
+       orr     v7.16b, v7.16b, v18.16b
+       add     v5.4s, v5.4s, v7.4s
+       eor     v17.16b, v17.16b, v5.16b
+       add     v5.4s, v5.4s, v6.4s
+       rev32   v17.8h, v17.8h
+       add     v4.4s, v4.4s, v17.4s
+       eor     v2.16b, v4.16b, v7.16b
+       ext     v7.16b, v16.16b, v16.16b, #4
+       ushr    v1.4s, v2.4s, #12
+       shl     v2.4s, v2.4s, #20
+       orr     v1.16b, v2.16b, v1.16b
+       add     v2.4s, v5.4s, v1.4s
+       eor     v5.16b, v2.16b, v17.16b
+       uzp1    v17.4s, v7.4s, v7.4s
+       ushr    v16.4s, v5.4s, #8
+       shl     v5.4s, v5.4s, #24
+       orr     v5.16b, v5.16b, v16.16b
+       ext     v16.16b, v17.16b, v7.16b, #8
+       add     v4.4s, v5.4s, v4.4s
+       uzp2    v16.4s, v16.4s, v0.4s
+       ext     v5.16b, v5.16b, v5.16b, #8
+       eor     v1.16b, v4.16b, v1.16b
+       add     v2.4s, v2.4s, v16.4s
        ext     v4.16b, v4.16b, v4.16b, #12
-       add     v0.4s, v2.4s, v0.4s
-       rev32   v2.8h, v3.8h
-       add     v3.4s, v4.4s, v2.4s
-       eor     v4.16b, v3.16b, v5.16b
-       ushr    v5.4s, v4.4s, #12
+       ushr    v17.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       ext     v2.16b, v2.16b, v2.16b, #4
+       orr     v1.16b, v1.16b, v17.16b
+       ext     v17.16b, v7.16b, v7.16b, #12
+       add     v2.4s, v2.4s, v1.4s
+       ext     v7.16b, v7.16b, v17.16b, #12
+       mov     v17.16b, v6.16b
+       eor     v5.16b, v5.16b, v2.16b
+       rev64   v7.4s, v7.4s
+       mov     v17.s[1], v3.s[2]
+       rev32   v5.8h, v5.8h
+       add     v4.4s, v4.4s, v5.4s
+       eor     v18.16b, v4.16b, v1.16b
+       trn2    v1.4s, v7.4s, v17.4s
+       ushr    v7.4s, v18.4s, #12
+       shl     v17.4s, v18.4s, #20
+       add     v2.4s, v2.4s, v1.4s
+       zip1    v18.2d, v6.2d, v0.2d
+       zip2    v0.4s, v0.4s, v6.4s
+       orr     v7.16b, v17.16b, v7.16b
+       mov     v18.s[3], v3.s[3]
+       add     v2.4s, v2.4s, v7.4s
+       eor     v5.16b, v2.16b, v5.16b
+       ext     v2.16b, v2.16b, v2.16b, #12
+       ushr    v17.4s, v5.4s, #8
+       shl     v5.4s, v5.4s, #24
+       orr     v5.16b, v5.16b, v17.16b
+       add     v17.4s, v5.4s, v4.4s
+       ext     v4.16b, v18.16b, v18.16b, #12
+       ext     v5.16b, v5.16b, v5.16b, #8
+       eor     v7.16b, v17.16b, v7.16b
+       uzp1    v4.4s, v18.4s, v4.4s
+       ext     v17.16b, v17.16b, v17.16b, #4
+       ushr    v18.4s, v7.4s, #7
+       shl     v7.4s, v7.4s, #25
+       add     v2.4s, v2.4s, v4.4s
+       orr     v7.16b, v7.16b, v18.16b
+       add     v2.4s, v2.4s, v7.4s
+       eor     v5.16b, v5.16b, v2.16b
+       rev32   v5.8h, v5.8h
+       add     v6.4s, v17.4s, v5.4s
+       zip1    v17.4s, v0.4s, v3.4s
+       zip1    v0.4s, v3.4s, v0.4s
+       eor     v3.16b, v6.16b, v7.16b
+       ext     v0.16b, v0.16b, v17.16b, #8
+       ushr    v7.4s, v3.4s, #12
+       shl     v3.4s, v3.4s, #20
+       add     v2.4s, v2.4s, v0.4s
+       orr     v3.16b, v3.16b, v7.16b
+       ext     v7.16b, v16.16b, v16.16b, #4
+       add     v2.4s, v2.4s, v3.4s
+       uzp1    v17.4s, v7.4s, v7.4s
+       eor     v5.16b, v2.16b, v5.16b
+       ushr    v16.4s, v5.4s, #8
+       shl     v5.4s, v5.4s, #24
+       orr     v5.16b, v5.16b, v16.16b
+       ext     v16.16b, v17.16b, v7.16b, #8
+       add     v6.4s, v5.4s, v6.4s
+       uzp2    v16.4s, v16.4s, v1.4s
+       ext     v5.16b, v5.16b, v5.16b, #8
+       eor     v3.16b, v6.16b, v3.16b
+       add     v2.4s, v2.4s, v16.4s
+       ext     v6.16b, v6.16b, v6.16b, #12
+       ushr    v17.4s, v3.4s, #7
+       shl     v3.4s, v3.4s, #25
+       ext     v2.16b, v2.16b, v2.16b, #4
+       orr     v3.16b, v3.16b, v17.16b
+       add     v17.4s, v2.4s, v3.4s
+       eor     v2.16b, v5.16b, v17.16b
+       ext     v5.16b, v7.16b, v7.16b, #12
+       rev32   v18.8h, v2.8h
+       ext     v2.16b, v7.16b, v5.16b, #12
+       mov     v5.16b, v0.16b
+       add     v6.4s, v6.4s, v18.4s
+       rev64   v2.4s, v2.4s
+       mov     v5.s[1], v4.s[2]
+       eor     v3.16b, v6.16b, v3.16b
+       trn2    v2.4s, v2.4s, v5.4s
+       ushr    v5.4s, v3.4s, #12
+       shl     v3.4s, v3.4s, #20
+       add     v7.4s, v17.4s, v2.4s
+       orr     v3.16b, v3.16b, v5.16b
+       add     v5.4s, v7.4s, v3.4s
+       eor     v7.16b, v5.16b, v18.16b
+       zip1    v18.2d, v0.2d, v1.2d
+       ext     v5.16b, v5.16b, v5.16b, #12
+       zip2    v0.4s, v1.4s, v0.4s
+       ushr    v17.4s, v7.4s, #8
+       shl     v7.4s, v7.4s, #24
+       mov     v18.s[3], v4.s[3]
+       orr     v7.16b, v7.16b, v17.16b
+       ext     v17.16b, v18.16b, v18.16b, #12
+       add     v6.4s, v7.4s, v6.4s
+       ext     v7.16b, v7.16b, v7.16b, #8
+       eor     v19.16b, v6.16b, v3.16b
+       uzp1    v3.4s, v18.4s, v17.4s
+       ext     v6.16b, v6.16b, v6.16b, #4
+       ushr    v17.4s, v19.4s, #7
+       shl     v18.4s, v19.4s, #25
+       add     v5.4s, v5.4s, v3.4s
+       orr     v17.16b, v18.16b, v17.16b
+       add     v5.4s, v5.4s, v17.4s
+       eor     v7.16b, v7.16b, v5.16b
+       rev32   v7.8h, v7.8h
+       add     v1.4s, v6.4s, v7.4s
+       zip1    v6.4s, v0.4s, v4.4s
+       zip1    v0.4s, v4.4s, v0.4s
+       eor     v4.16b, v1.16b, v17.16b
+       ext     v6.16b, v0.16b, v6.16b, #8
+       ushr    v0.4s, v4.4s, #12
        shl     v4.4s, v4.4s, #20
-       orr     v4.16b, v4.16b, v5.16b
-       add     v0.4s, v0.4s, v4.4s
-       eor     v2.16b, v0.16b, v2.16b
-       ushr    v5.4s, v2.4s, #8
-       shl     v2.4s, v2.4s, #24
-       orr     v2.16b, v2.16b, v5.16b
-       add     v3.4s, v2.4s, v3.4s
-       eor     v4.16b, v3.16b, v4.16b
-       ext     v0.16b, v0.16b, v0.16b, #12
-       ushr    v5.4s, v4.4s, #7
-       shl     v4.4s, v4.4s, #25
-       add     v0.4s, v0.4s, v21.4s
-       orr     v4.16b, v4.16b, v5.16b
-       ext     v2.16b, v2.16b, v2.16b, #8
-       add     v0.4s, v0.4s, v4.4s
-       eor     v2.16b, v0.16b, v2.16b
-       ext     v3.16b, v3.16b, v3.16b, #4
-       add     v0.4s, v0.4s, v1.4s
-       rev32   v1.8h, v2.8h
-       add     v2.4s, v3.4s, v1.4s
-       eor     v3.16b, v2.16b, v4.16b
-       ushr    v4.4s, v3.4s, #12
+       add     v5.4s, v5.4s, v6.4s
+       zip1    v20.2d, v6.2d, v2.2d
+       orr     v0.16b, v4.16b, v0.16b
+       mov     v20.s[3], v3.s[3]
+       add     v4.4s, v5.4s, v0.4s
+       eor     v5.16b, v4.16b, v7.16b
+       ext     v7.16b, v16.16b, v16.16b, #4
+       ushr    v16.4s, v5.4s, #8
+       shl     v5.4s, v5.4s, #24
+       uzp1    v17.4s, v7.4s, v7.4s
+       orr     v5.16b, v5.16b, v16.16b
+       ext     v16.16b, v17.16b, v7.16b, #8
+       add     v1.4s, v5.4s, v1.4s
+       uzp2    v16.4s, v16.4s, v2.4s
+       zip2    v2.4s, v2.4s, v6.4s
+       eor     v0.16b, v1.16b, v0.16b
+       add     v4.4s, v4.4s, v16.4s
+       ext     v1.16b, v1.16b, v1.16b, #12
+       ext     v16.16b, v16.16b, v16.16b, #4
+       ushr    v17.4s, v0.4s, #7
+       shl     v0.4s, v0.4s, #25
+       ext     v4.16b, v4.16b, v4.16b, #4
+       orr     v17.16b, v0.16b, v17.16b
+       ext     v0.16b, v5.16b, v5.16b, #8
+       ext     v5.16b, v7.16b, v7.16b, #12
+       add     v4.4s, v4.4s, v17.4s
+       eor     v0.16b, v0.16b, v4.16b
+       rev32   v18.8h, v0.8h
+       ext     v0.16b, v7.16b, v5.16b, #12
+       mov     v5.16b, v6.16b
+       add     v7.4s, v1.4s, v18.4s
+       rev64   v1.4s, v0.4s
+       mov     v5.s[1], v3.s[2]
+       eor     v17.16b, v7.16b, v17.16b
+       trn2    v1.4s, v1.4s, v5.4s
+       ushr    v19.4s, v17.4s, #12
+       shl     v17.4s, v17.4s, #20
+       add     v4.4s, v4.4s, v1.4s
+       orr     v17.16b, v17.16b, v19.16b
+       add     v19.4s, v4.4s, v17.4s
+       eor     v4.16b, v19.16b, v18.16b
+       ext     v19.16b, v19.16b, v19.16b, #12
+       ushr    v18.4s, v4.4s, #8
+       shl     v4.4s, v4.4s, #24
+       orr     v18.16b, v4.16b, v18.16b
+       ext     v4.16b, v20.16b, v20.16b, #12
+       add     v7.4s, v18.4s, v7.4s
+       uzp1    v4.4s, v20.4s, v4.4s
+       ext     v18.16b, v18.16b, v18.16b, #8
+       eor     v17.16b, v7.16b, v17.16b
+       add     v19.4s, v19.4s, v4.4s
+       ext     v7.16b, v7.16b, v7.16b, #4
+       ushr    v20.4s, v17.4s, #7
+       shl     v17.4s, v17.4s, #25
+       orr     v17.16b, v17.16b, v20.16b
+       add     v19.4s, v19.4s, v17.4s
+       eor     v18.16b, v18.16b, v19.16b
+       rev32   v18.8h, v18.8h
+       add     v6.4s, v7.4s, v18.4s
+       zip1    v7.4s, v2.4s, v3.4s
+       zip1    v2.4s, v3.4s, v2.4s
+       eor     v3.16b, v6.16b, v17.16b
+       ext     v2.16b, v2.16b, v7.16b, #8
+       ushr    v7.4s, v3.4s, #12
        shl     v3.4s, v3.4s, #20
-       orr     v3.16b, v3.16b, v4.16b
-       add     v0.4s, v0.4s, v3.4s
-       eor     v1.16b, v0.16b, v1.16b
-       ushr    v4.4s, v1.4s, #8
-       shl     v1.4s, v1.4s, #24
-       orr     v1.16b, v1.16b, v4.16b
-       add     v2.4s, v1.4s, v2.4s
-       eor     v3.16b, v2.16b, v3.16b
-       ushr    v4.4s, v3.4s, #7
+       add     v17.4s, v19.4s, v2.4s
+       zip1    v1.2d, v2.2d, v1.2d
+       zip2    v0.4s, v0.4s, v2.4s
+       orr     v3.16b, v3.16b, v7.16b
+       mov     v1.s[3], v4.s[3]
+       add     v7.4s, v17.4s, v3.4s
+       eor     v17.16b, v7.16b, v18.16b
+       ext     v7.16b, v7.16b, v7.16b, #4
+       ushr    v18.4s, v17.4s, #8
+       shl     v17.4s, v17.4s, #24
+       orr     v17.16b, v17.16b, v18.16b
+       ext     v18.16b, v16.16b, v16.16b, #8
+       add     v6.4s, v17.4s, v6.4s
+       uzp2    v5.4s, v18.4s, v5.4s
+       eor     v3.16b, v6.16b, v3.16b
+       ext     v5.16b, v5.16b, v18.16b, #4
+       ext     v6.16b, v6.16b, v6.16b, #12
+       ushr    v18.4s, v3.4s, #7
+       shl     v3.4s, v3.4s, #25
+       add     v5.4s, v7.4s, v5.4s
+       ext     v7.16b, v17.16b, v17.16b, #8
+       ext     v17.16b, v16.16b, v16.16b, #12
+       orr     v3.16b, v3.16b, v18.16b
+       ext     v16.16b, v16.16b, v17.16b, #12
+       add     v5.4s, v3.4s, v5.4s
+       mov     v17.16b, v2.16b
+       rev64   v16.4s, v16.4s
+       eor     v7.16b, v7.16b, v5.16b
+       mov     v17.s[1], v4.s[2]
+       rev32   v7.8h, v7.8h
+       trn2    v16.4s, v16.4s, v17.4s
+       add     v6.4s, v6.4s, v7.4s
+       add     v5.4s, v5.4s, v16.4s
+       eor     v3.16b, v6.16b, v3.16b
+       ushr    v17.4s, v3.4s, #12
+       shl     v3.4s, v3.4s, #20
+       orr     v3.16b, v3.16b, v17.16b
+       add     v5.4s, v5.4s, v3.4s
+       eor     v7.16b, v5.16b, v7.16b
+       ext     v5.16b, v5.16b, v5.16b, #12
+       ushr    v16.4s, v7.4s, #8
+       shl     v7.4s, v7.4s, #24
+       orr     v7.16b, v7.16b, v16.16b
+       ext     v16.16b, v1.16b, v1.16b, #12
+       add     v6.4s, v7.4s, v6.4s
+       uzp1    v1.4s, v1.4s, v16.4s
+       eor     v3.16b, v6.16b, v3.16b
+       add     v1.4s, v5.4s, v1.4s
+       ext     v5.16b, v7.16b, v7.16b, #8
+       ext     v6.16b, v6.16b, v6.16b, #4
+       ushr    v16.4s, v3.4s, #7
        shl     v3.4s, v3.4s, #25
+       orr     v3.16b, v3.16b, v16.16b
+       add     v1.4s, v1.4s, v3.4s
+       eor     v5.16b, v5.16b, v1.16b
+       rev32   v5.8h, v5.8h
+       add     v2.4s, v6.4s, v5.4s
+       zip1    v6.4s, v0.4s, v4.4s
+       zip1    v0.4s, v4.4s, v0.4s
+       eor     v3.16b, v2.16b, v3.16b
+       ext     v0.16b, v0.16b, v6.16b, #8
+       ushr    v4.4s, v3.4s, #12
+       shl     v3.4s, v3.4s, #20
+       add     v0.4s, v1.4s, v0.4s
+       orr     v1.16b, v3.16b, v4.16b
+       add     v0.4s, v0.4s, v1.4s
+       eor     v3.16b, v0.16b, v5.16b
        ext     v0.16b, v0.16b, v0.16b, #4
-       ext     v1.16b, v1.16b, v1.16b, #8
-       ext     v2.16b, v2.16b, v2.16b, #12
+       ushr    v4.4s, v3.4s, #8
+       shl     v3.4s, v3.4s, #24
        orr     v3.16b, v3.16b, v4.16b
+       add     v2.4s, v3.4s, v2.4s
+       ext     v3.16b, v3.16b, v3.16b, #8
+       eor     v1.16b, v2.16b, v1.16b
+       ext     v2.16b, v2.16b, v2.16b, #12
+       ushr    v4.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       stp     q2, q3, [x0, #32]
+       orr     v1.16b, v1.16b, v4.16b
+       stp     q0, q1, [x0]
+       ret
+.Lfunc_end1:
+       .size   compress_pre, .Lfunc_end1-compress_pre
+       .cfi_endproc
+
+       .globl  zfs_blake3_compress_xof_sse2
+       .p2align        2
+       .type   zfs_blake3_compress_xof_sse2,@function
+zfs_blake3_compress_xof_sse2: /* runs compress_pre into a 64-byte stack buffer, then writes 64 bytes to the pointer passed in x5 */
+       .cfi_startproc
+       hint    #25 /* PACIASP (hint-space encoding): sign the return address in x30 */
+       .cfi_negate_ra_state /* unwinder: return address is signed from here on */
+       sub     sp, sp, #96 /* frame: [sp,#0..63] scratch state, #64 x29/x30, #80 x20/x19 */
+       stp     x29, x30, [sp, #64]
+       add     x29, sp, #64 /* frame pointer -> saved x29/x30 pair */
+       stp     x20, x19, [sp, #80]
+       .cfi_def_cfa w29, 32 /* CFA = x29 + 32 = sp on entry (64 + 32 = 96) */
+       .cfi_offset w19, -8
+       .cfi_offset w20, -16
+       .cfi_offset w30, -24
+       .cfi_offset w29, -32
+       mov     x20, x0 /* keep arg0 across the call; 32 bytes are read from it below (presumably the chaining value - confirm against the C prototype) */
+       mov     x19, x5 /* keep arg5 across the call; 64 bytes are stored through it below */
+       mov     w5, w4 /* shift args 1..4 up one slot to match compress_pre's x2..x5 */
+       mov     x4, x3
+       mov     w3, w2
+       mov     x2, x1
+       mov     x0, sp /* compress_pre arg0: the 64-byte scratch buffer at [sp] */
+       mov     x1, x20 /* compress_pre arg1: original arg0 */
+       bl      compress_pre /* stores four 16-byte rows to [x0] */
+       ldp     q0, q1, [sp] /* q0 = row0, q1 = row1 */
+       ldp     q2, q3, [sp, #32] /* q2 = row2, q3 = row3 */
        eor     v0.16b, v2.16b, v0.16b
-       eor     v3.16b, v3.16b, v1.16b
-       stp     q0, q3, [x5]
-       ldr     q0, [x0]
+       eor     v1.16b, v3.16b, v1.16b /* v1 = row1 ^ row3 */
+       ldp     x29, x30, [sp, #64] /* restore x29/x30 (done before the remaining stores) */
+       stp     q0, q1, [x19] /* out[0..31] = row0^row2, row1^row3 */
+       ldr     q0, [x20] /* first 16 bytes of original arg0 */
        eor     v0.16b, v0.16b, v2.16b
-       str     q0, [x5, #32]
-       ldr     q0, [x0, #16]
-       eor     v0.16b, v0.16b, v1.16b
-       str     q0, [x5, #48]
+       str     q0, [x19, #32] /* out[32..47] = arg0[0..15] ^ row2 */
+       ldr     q0, [x20, #16] /* second 16 bytes of original arg0 */
+       eor     v0.16b, v0.16b, v3.16b
+       str     q0, [x19, #48] /* out[48..63] = arg0[16..31] ^ row3 */
+       ldp     x20, x19, [sp, #80] /* restore callee-saved x20/x19 */
+       add     sp, sp, #96 /* release the frame */
+       hint    #29 /* AUTIASP (hint-space encoding): authenticate x30 before returning */
        ret
-.Lfunc_end1:
-       .size   zfs_blake3_compress_xof_sse2, .Lfunc_end1-zfs_blake3_compress_xof_sse2
+.Lfunc_end2:
+       .size   zfs_blake3_compress_xof_sse2, .Lfunc_end2-zfs_blake3_compress_xof_sse2
        .cfi_endproc
 
        .section        .rodata.cst16,"aM",@progbits,16
        .p2align        4
-.LCPI2_0:
+.LCPI3_0:
        .word   0
        .word   1
        .word   2
@@ -991,19 +607,21 @@ zfs_blake3_compress_xof_sse2:
        .type   zfs_blake3_hash_many_sse2,@function
 zfs_blake3_hash_many_sse2:
        .cfi_startproc
+       hint    #25
+       .cfi_negate_ra_state
        stp     d15, d14, [sp, #-160]!
        stp     d13, d12, [sp, #16]
        stp     d11, d10, [sp, #32]
        stp     d9, d8, [sp, #48]
        stp     x29, x30, [sp, #64]
+       add     x29, sp, #64
        stp     x28, x27, [sp, #80]
        stp     x26, x25, [sp, #96]
        stp     x24, x23, [sp, #112]
        stp     x22, x21, [sp, #128]
        stp     x20, x19, [sp, #144]
-       mov     x29, sp
-       sub     sp, sp, #384
-       .cfi_def_cfa w29, 160
+       sub     sp, sp, #464
+       .cfi_def_cfa w29, 96
        .cfi_offset w19, -8
        .cfi_offset w20, -16
        .cfi_offset w21, -24
@@ -1024,1414 +642,1406 @@ zfs_blake3_hash_many_sse2:
        .cfi_offset b13, -144
        .cfi_offset b14, -152
        .cfi_offset b15, -160
-       ldr     x26, [x29, #168]
-       ldrb    w27, [x29, #160]
        mov     w19, w6
        mov     x20, x4
-       mov     x22, x2
-       mov     x28, x1
+       mov     x24, x1
+       ldr     x26, [x29, #104]
+       ldrb    w27, [x29, #96]
        cmp     x1, #4
-       mov     x24, x0
        str     x3, [sp, #40]
-       b.lo    .LBB2_8
-       adrp    x9, .LCPI2_0
-       ldr     q0, [x9, :lo12:.LCPI2_0]
-       sbfx    w11, w5, #0, #1
-       dup     v1.4s, w11
-       mov     w9, #58983
+       b.lo    .LBB3_6
+       adrp    x8, .LCPI3_0
+       sbfx    w9, w5, #0, #1
        mov     w10, #44677
-       and     v0.16b, v1.16b, v0.16b
        mov     w11, #62322
-       mov     w12, #62778
-       orr     w8, w7, w19
-       movk    w9, #27145, lsl #16
        movk    w10, #47975, lsl #16
        movk    w11, #15470, lsl #16
+       ldr     q0, [x8, :lo12:.LCPI3_0]
+       dup     v1.4s, w9
+       mov     w9, #58983
+       orr     w8, w7, w19
+       movk    w9, #27145, lsl #16
+       and     v0.16b, v1.16b, v0.16b
+       dup     v1.4s, w11
+       movi    v24.4s, #64
+       dup     v2.4s, w9
+       mov     w9, #62778
+       movk    w9, #42319, lsl #16
        str     q0, [sp, #16]
        orr     v0.4s, #128, lsl #24
-       movk    w12, #42319, lsl #16
+       stp     q2, q1, [sp, #48]
        str     q0, [sp]
-.LBB2_2:
-       ldr     x0, [sp, #40]
-       mov     x13, x0
-       ld1r    { v20.4s }, [x13], #4
-       add     x14, x0, #8
-       add     x15, x0, #12
-       add     x16, x0, #16
-       add     x17, x0, #20
-       add     x18, x0, #24
-       add     x0, x0, #28
-       ld1r    { v17.4s }, [x14]
-       ld1r    { v6.4s }, [x15]
-       ld1r    { v8.4s }, [x16]
-       ld1r    { v9.4s }, [x17]
-       ld1r    { v31.4s }, [x18]
-       ld1r    { v26.4s }, [x13]
-       ld1r    { v15.4s }, [x0]
-       cbz     x22, .LBB2_7
+       dup     v0.4s, w10
+       str     q0, [sp, #80]
+       b       .LBB3_3
+.LBB3_2:
+       zip1    v0.4s, v12.4s, v31.4s
+       add     x10, x20, #4
+       zip1    v1.4s, v29.4s, v30.4s
+       tst     w5, #0x1
+       zip1    v2.4s, v28.4s, v23.4s
+       csel    x20, x10, x20, ne
+       zip1    v3.4s, v13.4s, v25.4s
+       add     x0, x0, #32
+       zip2    v6.4s, v12.4s, v31.4s
+       sub     x24, x24, #4
+       zip1    v4.2d, v0.2d, v1.2d
+       cmp     x24, #3
+       zip2    v7.4s, v29.4s, v30.4s
+       zip1    v5.2d, v2.2d, v3.2d
+       zip2    v0.2d, v0.2d, v1.2d
+       zip2    v1.2d, v2.2d, v3.2d
+       zip2    v2.4s, v28.4s, v23.4s
+       zip2    v3.4s, v13.4s, v25.4s
+       stp     q4, q5, [x26]
+       zip2    v4.2d, v6.2d, v7.2d
+       stp     q0, q1, [x26, #32]
+       zip1    v0.2d, v6.2d, v7.2d
+       zip1    v1.2d, v2.2d, v3.2d
+       zip2    v2.2d, v2.2d, v3.2d
+       stp     q0, q1, [x26, #64]
+       stp     q4, q2, [x26, #96]
+       add     x26, x26, #128
+       b.ls    .LBB3_6
+.LBB3_3:
+       ldr     x14, [sp, #40]
+       mov     x10, x14
+       add     x11, x14, #8
+       add     x12, x14, #12
+       add     x13, x14, #16
+       ld1r    { v12.4s }, [x10], #4
+       ld1r    { v29.4s }, [x11]
+       add     x11, x14, #20
+       ld1r    { v30.4s }, [x12]
+       add     x12, x14, #24
+       ld1r    { v28.4s }, [x13]
+       ld1r    { v23.4s }, [x11]
+       add     x11, x14, #28
+       ld1r    { v13.4s }, [x12]
+       ld1r    { v31.4s }, [x10]
+       ld1r    { v25.4s }, [x11]
+       cbz     x2, .LBB3_2
        ldr     q1, [sp, #16]
        dup     v0.4s, w20
-       ldp     x13, x14, [x24]
-       ldp     x15, x16, [x24, #16]
+       lsr     x12, x20, #32
+       mov     x10, xzr
+       ldp     x13, x14, [x0, #16]
        add     v1.4s, v0.4s, v1.4s
+       mov     x15, x2
        movi    v0.4s, #128, lsl #24
-       str     q1, [sp, #64]
+       mov     w4, w8
+       str     q1, [sp, #112]
        eor     v0.16b, v1.16b, v0.16b
        ldr     q1, [sp]
-       lsr     x18, x20, #32
-       mov     x17, xzr
        cmgt    v0.4s, v1.4s, v0.4s
-       dup     v1.4s, w18
+       dup     v1.4s, w12
+       ldp     x11, x12, [x0]
        sub     v0.4s, v1.4s, v0.4s
-       mov     w18, w8
-       str     q0, [sp, #48]
-.LBB2_4:
-       mov     w2, #16
-       bfi     x2, x17, #6, #58
-       ldr     q1, [x13, x2]
-       ldr     q3, [x14, x2]
-       ldr     q2, [x15, x2]
-       ldr     q4, [x16, x2]
-       mov     w2, #32
-       bfi     x2, x17, #6, #58
-       ldr     q5, [x13, x2]
-       ldr     q18, [x14, x2]
-       ldr     q19, [x15, x2]
-       ldr     q23, [x16, x2]
-       mov     w2, #48
-       lsl     x3, x17, #6
-       bfi     x2, x17, #6, #58
-       add     x17, x17, #1
-       ldr     q0, [x13, x3]
-       ldr     q21, [x14, x3]
-       ldr     q7, [x15, x3]
-       ldr     q16, [x16, x3]
-       cmp     x17, x22
-       ldr     q13, [x13, x2]
-       ldr     q14, [x14, x2]
-       ldr     q29, [x15, x2]
-       ldr     q10, [x16, x2]
-       csel    w2, w27, wzr, eq
-       orr     w18, w2, w18
-       mov     x0, xzr
-       and     w18, w18, #0xff
-       add     x3, x3, #256
-.LBB2_5:
-       ldr     x2, [x24, x0]
-       add     x0, x0, #8
-       cmp     x0, #32
-       add     x2, x2, x3
-       prfm    pldl1keep, [x2]
-       b.ne    .LBB2_5
-       dup     v22.4s, w18
-       str     q22, [sp, #192]
-       zip1    v27.4s, v0.4s, v21.4s
-       zip2    v21.4s, v0.4s, v21.4s
-       zip1    v0.4s, v7.4s, v16.4s
-       zip2    v22.4s, v7.4s, v16.4s
-       zip1    v7.4s, v1.4s, v3.4s
-       zip1    v25.4s, v2.4s, v4.4s
-       zip2    v16.4s, v2.4s, v4.4s
-       zip1    v11.4s, v19.4s, v23.4s
-       zip2    v12.4s, v19.4s, v23.4s
-       zip1    v19.4s, v13.4s, v14.4s
-       zip2    v23.4s, v13.4s, v14.4s
-       zip1    v13.4s, v29.4s, v10.4s
-       zip2    v14.4s, v29.4s, v10.4s
-       add     v10.4s, v20.4s, v8.4s
-       add     v2.4s, v26.4s, v9.4s
-       ext     v20.16b, v22.16b, v21.16b, #8
-       ext     v26.16b, v25.16b, v7.16b, #8
-       zip2    v24.4s, v1.4s, v3.4s
-       add     v1.4s, v6.4s, v15.4s
-       ext     v6.16b, v0.16b, v27.16b, #8
-       ext     v20.16b, v21.16b, v20.16b, #8
-       mov     v21.d[1], v22.d[0]
-       ext     v22.16b, v7.16b, v26.16b, #8
-       mov     v7.d[1], v25.d[0]
-       add     v3.4s, v17.4s, v31.4s
-       str     q1, [sp, #144]
-       ext     v1.16b, v27.16b, v6.16b, #8
-       mov     v6.16b, v7.16b
-       zip1    v28.4s, v5.4s, v18.4s
-       stur    q1, [x29, #-80]
-       mov     v1.16b, v27.16b
-       mov     v27.16b, v24.16b
-       add     v3.4s, v3.4s, v6.4s
-       ldr     q6, [sp, #64]
-       ext     v29.16b, v16.16b, v24.16b, #8
-       mov     v1.d[1], v0.d[0]
-       ext     v0.16b, v11.16b, v28.16b, #8
-       mov     v27.d[1], v16.d[0]
-       ext     v16.16b, v14.16b, v23.16b, #8
-       stur    q7, [x29, #-144]
-       ext     v7.16b, v24.16b, v29.16b, #8
-       ext     v29.16b, v28.16b, v0.16b, #8
-       ext     v0.16b, v23.16b, v16.16b, #8
-       mov     v23.d[1], v14.d[0]
-       stp     q0, q23, [sp, #80]
-       add     v0.4s, v10.4s, v1.4s
-       eor     v16.16b, v0.16b, v6.16b
-       ldr     q6, [sp, #48]
-       add     v2.4s, v2.4s, v21.4s
-       mov     v28.d[1], v11.d[0]
-       zip2    v18.4s, v5.4s, v18.4s
-       eor     v10.16b, v2.16b, v6.16b
-       movi    v6.4s, #64
-       eor     v11.16b, v3.16b, v6.16b
-       ldr     q6, [sp, #144]
-       dup     v17.4s, w9
-       ext     v30.16b, v12.16b, v18.16b, #8
-       rev32   v16.8h, v16.8h
-       dup     v5.4s, w10
-       ext     v25.16b, v18.16b, v30.16b, #8
-       mov     v30.16b, v23.16b
-       mov     v23.16b, v1.16b
-       str     q1, [sp, #160]
-       rev32   v10.8h, v10.8h
-       add     v1.4s, v16.4s, v17.4s
-       add     v17.4s, v6.4s, v27.4s
-       ldr     q6, [sp, #192]
-       dup     v4.4s, w11
-       rev32   v11.8h, v11.8h
-       add     v5.4s, v10.4s, v5.4s
-       eor     v8.16b, v1.16b, v8.16b
-       stur    q21, [x29, #-128]
-       mov     v18.d[1], v12.d[0]
-       add     v4.4s, v11.4s, v4.4s
-       eor     v9.16b, v5.16b, v9.16b
-       ushr    v12.4s, v8.4s, #12
-       shl     v8.4s, v8.4s, #20
-       ldur    q21, [x29, #-80]
-       ext     v26.16b, v13.16b, v19.16b, #8
-       eor     v31.16b, v4.16b, v31.16b
-       orr     v8.16b, v8.16b, v12.16b
-       ushr    v12.4s, v9.4s, #12
-       shl     v9.4s, v9.4s, #20
-       ext     v26.16b, v19.16b, v26.16b, #8
-       mov     v19.d[1], v13.d[0]
-       orr     v9.16b, v9.16b, v12.16b
-       ushr    v12.4s, v31.4s, #12
-       shl     v31.4s, v31.4s, #20
-       eor     v13.16b, v17.16b, v6.16b
-       orr     v31.16b, v31.16b, v12.16b
-       dup     v12.4s, w12
-       rev32   v13.8h, v13.8h
-       add     v12.4s, v13.4s, v12.4s
-       add     v0.4s, v0.4s, v21.4s
-       eor     v14.16b, v12.16b, v15.16b
-       add     v0.4s, v0.4s, v8.4s
-       add     v2.4s, v2.4s, v20.4s
-       ushr    v15.4s, v14.4s, #12
-       shl     v14.4s, v14.4s, #20
-       eor     v16.16b, v0.16b, v16.16b
-       add     v2.4s, v2.4s, v9.4s
-       add     v3.4s, v3.4s, v22.4s
-       orr     v14.16b, v14.16b, v15.16b
-       ushr    v15.4s, v16.4s, #8
-       shl     v16.4s, v16.4s, #24
-       eor     v10.16b, v2.16b, v10.16b
-       add     v3.4s, v3.4s, v31.4s
-       add     v17.4s, v17.4s, v7.4s
-       orr     v16.16b, v16.16b, v15.16b
-       ushr    v15.4s, v10.4s, #8
-       shl     v10.4s, v10.4s, #24
-       eor     v11.16b, v3.16b, v11.16b
-       add     v17.4s, v17.4s, v14.4s
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v11.4s, #8
-       shl     v11.4s, v11.4s, #24
-       eor     v13.16b, v17.16b, v13.16b
-       add     v1.4s, v16.4s, v1.4s
-       orr     v11.16b, v11.16b, v15.16b
-       ushr    v15.4s, v13.4s, #8
-       shl     v13.4s, v13.4s, #24
-       eor     v8.16b, v1.16b, v8.16b
-       add     v5.4s, v10.4s, v5.4s
-       orr     v13.16b, v13.16b, v15.16b
-       ushr    v15.4s, v8.4s, #7
-       shl     v8.4s, v8.4s, #25
-       eor     v9.16b, v5.16b, v9.16b
-       add     v4.4s, v11.4s, v4.4s
-       orr     v8.16b, v8.16b, v15.16b
-       ushr    v15.4s, v9.4s, #7
-       shl     v9.4s, v9.4s, #25
-       eor     v31.16b, v4.16b, v31.16b
-       add     v12.4s, v13.4s, v12.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v31.4s, #7
-       shl     v31.4s, v31.4s, #25
-       eor     v14.16b, v12.16b, v14.16b
-       add     v0.4s, v0.4s, v28.4s
-       orr     v31.16b, v31.16b, v15.16b
-       ushr    v15.4s, v14.4s, #7
-       shl     v14.4s, v14.4s, #25
-       add     v0.4s, v0.4s, v9.4s
-       add     v2.4s, v2.4s, v18.4s
-       orr     v14.16b, v14.16b, v15.16b
-       eor     v13.16b, v0.16b, v13.16b
-       add     v2.4s, v2.4s, v31.4s
-       add     v3.4s, v3.4s, v19.4s
-       rev32   v13.8h, v13.8h
-       eor     v16.16b, v2.16b, v16.16b
-       add     v3.4s, v3.4s, v14.4s
-       add     v17.4s, v17.4s, v30.4s
-       add     v4.4s, v4.4s, v13.4s
-       rev32   v16.8h, v16.8h
-       eor     v10.16b, v3.16b, v10.16b
-       add     v17.4s, v17.4s, v8.4s
-       eor     v9.16b, v4.16b, v9.16b
-       add     v12.4s, v12.4s, v16.4s
-       rev32   v10.8h, v10.8h
-       eor     v11.16b, v17.16b, v11.16b
-       mov     v24.16b, v7.16b
-       stur    q7, [x29, #-112]
-       ushr    v15.4s, v9.4s, #12
-       shl     v9.4s, v9.4s, #20
-       eor     v31.16b, v12.16b, v31.16b
-       add     v1.4s, v1.4s, v10.4s
-       rev32   v11.8h, v11.8h
-       mov     v7.16b, v26.16b
-       add     v3.4s, v3.4s, v26.4s
-       ldr     q26, [sp, #80]
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v31.4s, #12
-       shl     v31.4s, v31.4s, #20
-       eor     v14.16b, v1.16b, v14.16b
-       add     v5.4s, v5.4s, v11.4s
-       add     v0.4s, v0.4s, v29.4s
-       orr     v31.16b, v31.16b, v15.16b
-       ushr    v15.4s, v14.4s, #12
-       shl     v14.4s, v14.4s, #20
-       eor     v8.16b, v5.16b, v8.16b
-       add     v0.4s, v0.4s, v9.4s
-       add     v2.4s, v2.4s, v25.4s
-       orr     v14.16b, v14.16b, v15.16b
-       ushr    v15.4s, v8.4s, #12
-       shl     v8.4s, v8.4s, #20
-       eor     v13.16b, v0.16b, v13.16b
-       add     v2.4s, v2.4s, v31.4s
-       orr     v8.16b, v8.16b, v15.16b
-       ushr    v15.4s, v13.4s, #8
-       shl     v13.4s, v13.4s, #24
-       eor     v16.16b, v2.16b, v16.16b
-       add     v3.4s, v3.4s, v14.4s
-       add     v17.4s, v17.4s, v26.4s
-       orr     v13.16b, v13.16b, v15.16b
-       ushr    v15.4s, v16.4s, #8
-       shl     v16.4s, v16.4s, #24
-       eor     v10.16b, v3.16b, v10.16b
-       add     v17.4s, v17.4s, v8.4s
-       orr     v16.16b, v16.16b, v15.16b
-       ushr    v15.4s, v10.4s, #8
-       shl     v10.4s, v10.4s, #24
-       eor     v11.16b, v17.16b, v11.16b
-       add     v4.4s, v13.4s, v4.4s
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v11.4s, #8
-       shl     v11.4s, v11.4s, #24
-       eor     v9.16b, v4.16b, v9.16b
-       add     v12.4s, v16.4s, v12.4s
-       str     q22, [sp, #128]
-       orr     v11.16b, v11.16b, v15.16b
-       ushr    v15.4s, v9.4s, #7
-       shl     v9.4s, v9.4s, #25
-       eor     v31.16b, v12.16b, v31.16b
-       add     v1.4s, v10.4s, v1.4s
-       ldur    q22, [x29, #-128]
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v31.4s, #7
-       shl     v31.4s, v31.4s, #25
-       eor     v14.16b, v1.16b, v14.16b
-       add     v5.4s, v11.4s, v5.4s
-       orr     v31.16b, v31.16b, v15.16b
-       ushr    v15.4s, v14.4s, #7
-       shl     v14.4s, v14.4s, #25
-       eor     v8.16b, v5.16b, v8.16b
-       mov     v6.16b, v18.16b
-       orr     v14.16b, v14.16b, v15.16b
-       ushr    v15.4s, v8.4s, #7
-       shl     v8.4s, v8.4s, #25
-       ldur    q18, [x29, #-144]
-       orr     v8.16b, v8.16b, v15.16b
-       add     v0.4s, v0.4s, v22.4s
-       add     v0.4s, v0.4s, v8.4s
-       add     v2.4s, v2.4s, v20.4s
-       eor     v16.16b, v0.16b, v16.16b
-       add     v2.4s, v2.4s, v9.4s
-       add     v3.4s, v3.4s, v24.4s
-       rev32   v16.8h, v16.8h
-       eor     v10.16b, v2.16b, v10.16b
-       add     v3.4s, v3.4s, v31.4s
-       add     v17.4s, v17.4s, v18.4s
-       add     v1.4s, v1.4s, v16.4s
-       rev32   v10.8h, v10.8h
-       eor     v11.16b, v3.16b, v11.16b
-       add     v17.4s, v17.4s, v14.4s
-       eor     v8.16b, v1.16b, v8.16b
-       add     v5.4s, v5.4s, v10.4s
-       rev32   v11.8h, v11.8h
-       eor     v13.16b, v17.16b, v13.16b
-       ushr    v15.4s, v8.4s, #12
-       shl     v8.4s, v8.4s, #20
-       eor     v9.16b, v5.16b, v9.16b
-       add     v4.4s, v4.4s, v11.4s
-       rev32   v13.8h, v13.8h
-       orr     v8.16b, v8.16b, v15.16b
-       ushr    v15.4s, v9.4s, #12
-       shl     v9.4s, v9.4s, #20
-       eor     v31.16b, v4.16b, v31.16b
-       add     v12.4s, v12.4s, v13.4s
-       add     v0.4s, v0.4s, v27.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v31.4s, #12
-       shl     v31.4s, v31.4s, #20
-       eor     v14.16b, v12.16b, v14.16b
-       add     v0.4s, v0.4s, v8.4s
-       add     v2.4s, v2.4s, v6.4s
-       orr     v31.16b, v31.16b, v15.16b
-       ushr    v15.4s, v14.4s, #12
-       shl     v14.4s, v14.4s, #20
-       eor     v16.16b, v0.16b, v16.16b
-       add     v2.4s, v2.4s, v9.4s
-       add     v3.4s, v3.4s, v23.4s
-       orr     v14.16b, v14.16b, v15.16b
-       ushr    v15.4s, v16.4s, #8
-       shl     v16.4s, v16.4s, #24
-       eor     v10.16b, v2.16b, v10.16b
-       add     v3.4s, v3.4s, v31.4s
-       add     v17.4s, v17.4s, v7.4s
-       orr     v16.16b, v16.16b, v15.16b
-       ushr    v15.4s, v10.4s, #8
-       shl     v10.4s, v10.4s, #24
-       eor     v11.16b, v3.16b, v11.16b
-       add     v17.4s, v17.4s, v14.4s
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v11.4s, #8
-       shl     v11.4s, v11.4s, #24
-       eor     v13.16b, v17.16b, v13.16b
-       add     v1.4s, v16.4s, v1.4s
-       orr     v11.16b, v11.16b, v15.16b
-       ushr    v15.4s, v13.4s, #8
-       shl     v13.4s, v13.4s, #24
-       eor     v8.16b, v1.16b, v8.16b
-       add     v5.4s, v10.4s, v5.4s
-       orr     v13.16b, v13.16b, v15.16b
-       ushr    v15.4s, v8.4s, #7
-       shl     v8.4s, v8.4s, #25
-       eor     v9.16b, v5.16b, v9.16b
-       add     v4.4s, v11.4s, v4.4s
-       orr     v8.16b, v8.16b, v15.16b
-       ushr    v15.4s, v9.4s, #7
-       shl     v9.4s, v9.4s, #25
-       eor     v31.16b, v4.16b, v31.16b
-       add     v12.4s, v13.4s, v12.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v31.4s, #7
-       shl     v31.4s, v31.4s, #25
-       eor     v14.16b, v12.16b, v14.16b
-       add     v0.4s, v0.4s, v21.4s
-       orr     v31.16b, v31.16b, v15.16b
-       ushr    v15.4s, v14.4s, #7
-       shl     v14.4s, v14.4s, #25
-       add     v0.4s, v0.4s, v9.4s
-       add     v2.4s, v2.4s, v19.4s
-       orr     v14.16b, v14.16b, v15.16b
-       eor     v13.16b, v0.16b, v13.16b
-       add     v2.4s, v2.4s, v31.4s
-       add     v3.4s, v3.4s, v29.4s
-       str     q28, [sp, #112]
-       rev32   v13.8h, v13.8h
-       eor     v16.16b, v2.16b, v16.16b
-       add     v3.4s, v3.4s, v14.4s
-       add     v17.4s, v17.4s, v26.4s
-       add     v4.4s, v4.4s, v13.4s
-       rev32   v16.8h, v16.8h
-       eor     v10.16b, v3.16b, v10.16b
-       add     v17.4s, v17.4s, v8.4s
-       ldp     q28, q23, [sp, #112]
-       eor     v9.16b, v4.16b, v9.16b
-       add     v12.4s, v12.4s, v16.4s
-       rev32   v10.8h, v10.8h
-       eor     v11.16b, v17.16b, v11.16b
-       ldr     q21, [sp, #96]
-       ushr    v15.4s, v9.4s, #12
-       shl     v9.4s, v9.4s, #20
-       eor     v31.16b, v12.16b, v31.16b
-       add     v1.4s, v1.4s, v10.4s
-       rev32   v11.8h, v11.8h
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v31.4s, #12
-       shl     v31.4s, v31.4s, #20
-       eor     v14.16b, v1.16b, v14.16b
-       add     v5.4s, v5.4s, v11.4s
-       add     v0.4s, v0.4s, v25.4s
-       orr     v31.16b, v31.16b, v15.16b
-       ushr    v15.4s, v14.4s, #12
-       shl     v14.4s, v14.4s, #20
-       eor     v8.16b, v5.16b, v8.16b
-       add     v0.4s, v0.4s, v9.4s
-       add     v2.4s, v2.4s, v23.4s
-       orr     v14.16b, v14.16b, v15.16b
-       ushr    v15.4s, v8.4s, #12
-       shl     v8.4s, v8.4s, #20
-       eor     v13.16b, v0.16b, v13.16b
-       add     v2.4s, v2.4s, v31.4s
-       add     v3.4s, v3.4s, v21.4s
-       orr     v8.16b, v8.16b, v15.16b
-       ushr    v15.4s, v13.4s, #8
-       shl     v13.4s, v13.4s, #24
-       eor     v16.16b, v2.16b, v16.16b
-       add     v3.4s, v3.4s, v14.4s
-       add     v17.4s, v17.4s, v28.4s
-       orr     v13.16b, v13.16b, v15.16b
-       ushr    v15.4s, v16.4s, #8
-       shl     v16.4s, v16.4s, #24
-       eor     v10.16b, v3.16b, v10.16b
-       add     v17.4s, v17.4s, v8.4s
-       orr     v16.16b, v16.16b, v15.16b
-       ushr    v15.4s, v10.4s, #8
-       shl     v10.4s, v10.4s, #24
-       eor     v11.16b, v17.16b, v11.16b
-       add     v4.4s, v13.4s, v4.4s
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v11.4s, #8
-       shl     v11.4s, v11.4s, #24
-       eor     v9.16b, v4.16b, v9.16b
-       add     v12.4s, v16.4s, v12.4s
-       orr     v11.16b, v11.16b, v15.16b
-       ushr    v15.4s, v9.4s, #7
-       shl     v9.4s, v9.4s, #25
-       eor     v31.16b, v12.16b, v31.16b
-       add     v1.4s, v10.4s, v1.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v31.4s, #7
-       shl     v31.4s, v31.4s, #25
-       eor     v14.16b, v1.16b, v14.16b
-       add     v5.4s, v11.4s, v5.4s
-       orr     v31.16b, v31.16b, v15.16b
-       ushr    v15.4s, v14.4s, #7
-       shl     v14.4s, v14.4s, #25
-       eor     v8.16b, v5.16b, v8.16b
-       mov     v30.16b, v29.16b
-       mov     v29.16b, v25.16b
-       orr     v14.16b, v14.16b, v15.16b
-       ushr    v15.4s, v8.4s, #7
-       shl     v8.4s, v8.4s, #25
-       ldur    q25, [x29, #-112]
-       orr     v8.16b, v8.16b, v15.16b
-       add     v0.4s, v0.4s, v20.4s
-       add     v0.4s, v0.4s, v8.4s
-       add     v2.4s, v2.4s, v6.4s
-       eor     v16.16b, v0.16b, v16.16b
-       add     v2.4s, v2.4s, v9.4s
-       add     v3.4s, v3.4s, v7.4s
-       rev32   v16.8h, v16.8h
-       eor     v10.16b, v2.16b, v10.16b
-       add     v3.4s, v3.4s, v31.4s
-       add     v17.4s, v17.4s, v25.4s
-       add     v1.4s, v1.4s, v16.4s
-       rev32   v10.8h, v10.8h
-       eor     v11.16b, v3.16b, v11.16b
-       add     v17.4s, v17.4s, v14.4s
-       eor     v8.16b, v1.16b, v8.16b
-       add     v5.4s, v5.4s, v10.4s
-       rev32   v11.8h, v11.8h
-       eor     v13.16b, v17.16b, v13.16b
-       ushr    v15.4s, v8.4s, #12
-       shl     v8.4s, v8.4s, #20
-       eor     v9.16b, v5.16b, v9.16b
-       add     v4.4s, v4.4s, v11.4s
-       rev32   v13.8h, v13.8h
-       orr     v8.16b, v8.16b, v15.16b
-       ushr    v15.4s, v9.4s, #12
-       shl     v9.4s, v9.4s, #20
-       eor     v31.16b, v4.16b, v31.16b
-       add     v12.4s, v12.4s, v13.4s
-       add     v0.4s, v0.4s, v18.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v31.4s, #12
-       shl     v31.4s, v31.4s, #20
-       eor     v14.16b, v12.16b, v14.16b
-       add     v0.4s, v0.4s, v8.4s
-       add     v2.4s, v2.4s, v19.4s
-       orr     v31.16b, v31.16b, v15.16b
-       ushr    v15.4s, v14.4s, #12
-       shl     v14.4s, v14.4s, #20
-       eor     v16.16b, v0.16b, v16.16b
-       add     v2.4s, v2.4s, v9.4s
-       add     v3.4s, v3.4s, v22.4s
-       orr     v14.16b, v14.16b, v15.16b
-       ushr    v15.4s, v16.4s, #8
-       shl     v16.4s, v16.4s, #24
-       eor     v10.16b, v2.16b, v10.16b
-       add     v3.4s, v3.4s, v31.4s
-       add     v17.4s, v17.4s, v21.4s
-       orr     v16.16b, v16.16b, v15.16b
-       ushr    v15.4s, v10.4s, #8
-       shl     v10.4s, v10.4s, #24
-       eor     v11.16b, v3.16b, v11.16b
-       add     v17.4s, v17.4s, v14.4s
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v11.4s, #8
-       shl     v11.4s, v11.4s, #24
-       eor     v13.16b, v17.16b, v13.16b
-       add     v1.4s, v16.4s, v1.4s
-       orr     v11.16b, v11.16b, v15.16b
-       ushr    v15.4s, v13.4s, #8
-       shl     v13.4s, v13.4s, #24
-       eor     v8.16b, v1.16b, v8.16b
-       add     v5.4s, v10.4s, v5.4s
-       orr     v13.16b, v13.16b, v15.16b
-       ushr    v15.4s, v8.4s, #7
-       shl     v8.4s, v8.4s, #25
-       eor     v9.16b, v5.16b, v9.16b
-       add     v4.4s, v11.4s, v4.4s
-       orr     v8.16b, v8.16b, v15.16b
-       ushr    v15.4s, v9.4s, #7
-       shl     v9.4s, v9.4s, #25
-       eor     v31.16b, v4.16b, v31.16b
-       add     v12.4s, v13.4s, v12.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v31.4s, #7
-       shl     v31.4s, v31.4s, #25
-       eor     v14.16b, v12.16b, v14.16b
+       str     q0, [sp, #96]
+.LBB3_5:
+       add     x17, x11, x10
+       add     x21, x12, x10
+       add     x16, x13, x10
+       add     x6, x14, x10
+       subs    x15, x15, #1
+       add     x10, x10, #64
+       ldp     q0, q1, [x17]
+       csel    w3, w27, wzr, eq
+       orr     w3, w3, w4
+       mov     w4, w19
+       and     w3, w3, #0xff
+       ldp     q3, q6, [x21]
+       dup     v2.4s, w3
+       zip1    v21.4s, v0.4s, v3.4s
+       zip2    v19.4s, v0.4s, v3.4s
+       ldp     q5, q7, [x16]
+       zip1    v17.4s, v1.4s, v6.4s
+       zip2    v22.4s, v1.4s, v6.4s
+       ldp     q16, q18, [x6]
+       zip1    v4.4s, v5.4s, v16.4s
+       zip2    v0.4s, v5.4s, v16.4s
+       ldp     q26, q27, [x17, #32]
+       zip1    v1.4s, v7.4s, v18.4s
+       zip2    v3.4s, v7.4s, v18.4s
+       zip2    v20.2d, v19.2d, v0.2d
+       mov     v19.d[1], v0.d[0]
+       dup     v18.4s, w9
+       ldp     q8, q9, [x21, #32]
+       stur    q19, [x29, #-208]
+       zip2    v7.4s, v26.4s, v8.4s
+       zip1    v10.4s, v26.4s, v8.4s
+       ldp     q11, q5, [x16, #32]
+       zip2    v26.2d, v17.2d, v1.2d
+       stp     q7, q26, [sp, #192]
+       mov     v17.d[1], v1.d[0]
+       add     v1.4s, v23.4s, v31.4s
+       ldp     q16, q6, [x6, #32]
+       stur    q17, [x29, #-256]
+       add     v1.4s, v1.4s, v19.4s
+       zip1    v8.4s, v11.4s, v16.4s
+       zip2    v7.4s, v11.4s, v16.4s
+       zip1    v11.4s, v27.4s, v9.4s
+       zip2    v9.4s, v27.4s, v9.4s
+       zip2    v27.2d, v21.2d, v4.2d
+       mov     v21.d[1], v4.d[0]
+       str     q7, [sp, #224]
+       add     v4.4s, v28.4s, v12.4s
+       zip1    v15.4s, v5.4s, v6.4s
+       zip2    v14.4s, v5.4s, v6.4s
+       stur    q27, [x29, #-192]
+       zip2    v16.2d, v22.2d, v3.2d
+       stp     q20, q21, [x29, #-240]
+       add     v0.4s, v4.4s, v21.4s
+       ldp     q6, q4, [sp, #96]
+       mov     v22.d[1], v3.d[0]
+       add     v5.4s, v25.4s, v30.4s
+       add     v3.4s, v13.4s, v29.4s
+       eor     v6.16b, v1.16b, v6.16b
+       add     v1.4s, v1.4s, v20.4s
+       str     q22, [sp, #256]
+       eor     v4.16b, v0.16b, v4.16b
+       add     v5.4s, v5.4s, v22.4s
+       add     v3.4s, v3.4s, v17.4s
+       ldr     q17, [sp, #48]
+       rev32   v6.8h, v6.8h
+       rev32   v4.8h, v4.8h
+       eor     v2.16b, v5.16b, v2.16b
+       eor     v7.16b, v3.16b, v24.16b
        add     v0.4s, v0.4s, v27.4s
-       orr     v31.16b, v31.16b, v15.16b
-       ushr    v15.4s, v14.4s, #7
-       shl     v14.4s, v14.4s, #25
-       add     v0.4s, v0.4s, v9.4s
-       add     v2.4s, v2.4s, v30.4s
-       orr     v14.16b, v14.16b, v15.16b
-       eor     v13.16b, v0.16b, v13.16b
-       add     v2.4s, v2.4s, v31.4s
-       add     v3.4s, v3.4s, v29.4s
-       rev32   v13.8h, v13.8h
-       eor     v16.16b, v2.16b, v16.16b
-       add     v3.4s, v3.4s, v14.4s
-       add     v17.4s, v17.4s, v28.4s
-       add     v4.4s, v4.4s, v13.4s
-       rev32   v16.8h, v16.8h
-       eor     v10.16b, v3.16b, v10.16b
-       add     v17.4s, v17.4s, v8.4s
-       eor     v9.16b, v4.16b, v9.16b
-       add     v12.4s, v12.4s, v16.4s
-       rev32   v10.8h, v10.8h
-       eor     v11.16b, v17.16b, v11.16b
-       ushr    v15.4s, v9.4s, #12
-       shl     v9.4s, v9.4s, #20
-       eor     v31.16b, v12.16b, v31.16b
-       add     v1.4s, v1.4s, v10.4s
-       rev32   v11.8h, v11.8h
-       ldr     q24, [sp, #160]
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v31.4s, #12
-       shl     v31.4s, v31.4s, #20
-       eor     v14.16b, v1.16b, v14.16b
-       add     v5.4s, v5.4s, v11.4s
-       stur    q7, [x29, #-64]
-       orr     v31.16b, v31.16b, v15.16b
-       ushr    v15.4s, v14.4s, #12
-       shl     v14.4s, v14.4s, #20
-       eor     v8.16b, v5.16b, v8.16b
-       mov     v7.16b, v26.16b
-       add     v3.4s, v3.4s, v26.4s
-       ldur    q26, [x29, #-80]
-       orr     v14.16b, v14.16b, v15.16b
-       ushr    v15.4s, v8.4s, #12
-       shl     v8.4s, v8.4s, #20
-       add     v0.4s, v0.4s, v23.4s
-       orr     v8.16b, v8.16b, v15.16b
-       add     v15.4s, v0.4s, v9.4s
-       add     v2.4s, v2.4s, v24.4s
-       eor     v0.16b, v15.16b, v13.16b
-       add     v2.4s, v2.4s, v31.4s
-       ushr    v13.4s, v0.4s, #8
-       shl     v0.4s, v0.4s, #24
-       eor     v16.16b, v2.16b, v16.16b
-       add     v3.4s, v3.4s, v14.4s
-       add     v17.4s, v17.4s, v26.4s
-       orr     v0.16b, v0.16b, v13.16b
-       ushr    v13.4s, v16.4s, #8
-       shl     v16.4s, v16.4s, #24
-       eor     v10.16b, v3.16b, v10.16b
-       add     v17.4s, v17.4s, v8.4s
-       orr     v16.16b, v16.16b, v13.16b
-       ushr    v13.4s, v10.4s, #8
-       shl     v10.4s, v10.4s, #24
-       eor     v11.16b, v17.16b, v11.16b
-       add     v4.4s, v0.4s, v4.4s
-       orr     v10.16b, v10.16b, v13.16b
-       ushr    v13.4s, v11.4s, #8
-       shl     v11.4s, v11.4s, #24
-       eor     v9.16b, v4.16b, v9.16b
-       add     v12.4s, v16.4s, v12.4s
-       orr     v11.16b, v11.16b, v13.16b
-       ushr    v13.4s, v9.4s, #7
-       shl     v9.4s, v9.4s, #25
-       eor     v31.16b, v12.16b, v31.16b
-       orr     v9.16b, v9.16b, v13.16b
-       ushr    v13.4s, v31.4s, #7
-       shl     v31.4s, v31.4s, #25
-       add     v1.4s, v10.4s, v1.4s
-       orr     v31.16b, v31.16b, v13.16b
-       eor     v13.16b, v1.16b, v14.16b
-       add     v5.4s, v11.4s, v5.4s
-       ushr    v14.4s, v13.4s, #7
-       shl     v13.4s, v13.4s, #25
-       eor     v8.16b, v5.16b, v8.16b
-       orr     v13.16b, v13.16b, v14.16b
-       ushr    v14.4s, v8.4s, #7
-       shl     v8.4s, v8.4s, #25
-       stur    q6, [x29, #-96]
-       orr     v8.16b, v8.16b, v14.16b
-       add     v14.4s, v15.4s, v6.4s
-       ldur    q6, [x29, #-64]
-       mov     v18.16b, v19.16b
-       add     v14.4s, v14.4s, v8.4s
-       add     v2.4s, v2.4s, v18.4s
-       eor     v16.16b, v14.16b, v16.16b
-       add     v2.4s, v2.4s, v9.4s
-       add     v3.4s, v3.4s, v21.4s
-       rev32   v16.8h, v16.8h
-       eor     v10.16b, v2.16b, v10.16b
-       add     v3.4s, v3.4s, v31.4s
-       add     v17.4s, v17.4s, v6.4s
-       add     v1.4s, v1.4s, v16.4s
-       rev32   v10.8h, v10.8h
-       eor     v11.16b, v3.16b, v11.16b
-       add     v17.4s, v17.4s, v13.4s
-       eor     v8.16b, v1.16b, v8.16b
-       add     v5.4s, v5.4s, v10.4s
-       rev32   v11.8h, v11.8h
-       eor     v0.16b, v17.16b, v0.16b
-       ushr    v15.4s, v8.4s, #12
-       shl     v8.4s, v8.4s, #20
-       eor     v9.16b, v5.16b, v9.16b
-       add     v4.4s, v4.4s, v11.4s
-       rev32   v0.8h, v0.8h
+       add     v21.4s, v4.4s, v17.4s
+       rev32   v31.8h, v2.8h
+       ldr     q2, [sp, #80]
+       rev32   v7.8h, v7.8h
+       mov     v27.16b, v16.16b
+       eor     v17.16b, v21.16b, v28.16b
+       add     v29.4s, v6.4s, v2.4s
+       ldr     q2, [sp, #64]
+       add     v24.4s, v31.4s, v18.4s
        str     q27, [sp, #176]
-       mov     v27.16b, v30.16b
-       orr     v8.16b, v8.16b, v15.16b
-       ushr    v15.4s, v9.4s, #12
-       shl     v9.4s, v9.4s, #20
-       eor     v31.16b, v4.16b, v31.16b
-       add     v12.4s, v12.4s, v0.4s
-       add     v14.4s, v14.4s, v25.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v31.4s, #12
-       shl     v31.4s, v31.4s, #20
-       eor     v13.16b, v12.16b, v13.16b
-       add     v14.4s, v14.4s, v8.4s
-       add     v2.4s, v2.4s, v27.4s
-       orr     v31.16b, v31.16b, v15.16b
-       ushr    v15.4s, v13.4s, #12
-       shl     v13.4s, v13.4s, #20
-       eor     v16.16b, v14.16b, v16.16b
-       add     v2.4s, v2.4s, v9.4s
-       add     v3.4s, v3.4s, v20.4s
-       orr     v13.16b, v13.16b, v15.16b
-       ushr    v15.4s, v16.4s, #8
-       shl     v16.4s, v16.4s, #24
-       eor     v10.16b, v2.16b, v10.16b
-       add     v3.4s, v3.4s, v31.4s
-       add     v17.4s, v17.4s, v7.4s
-       orr     v16.16b, v16.16b, v15.16b
-       ushr    v15.4s, v10.4s, #8
-       shl     v10.4s, v10.4s, #24
-       eor     v11.16b, v3.16b, v11.16b
-       add     v17.4s, v17.4s, v13.4s
-       mov     v30.16b, v23.16b
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v11.4s, #8
-       shl     v11.4s, v11.4s, #24
-       eor     v0.16b, v17.16b, v0.16b
-       add     v1.4s, v16.4s, v1.4s
-       ldur    q23, [x29, #-144]
-       orr     v11.16b, v11.16b, v15.16b
-       ushr    v15.4s, v0.4s, #8
-       shl     v0.4s, v0.4s, #24
-       eor     v8.16b, v1.16b, v8.16b
-       add     v5.4s, v10.4s, v5.4s
-       orr     v0.16b, v0.16b, v15.16b
-       ushr    v15.4s, v8.4s, #7
-       shl     v8.4s, v8.4s, #25
-       eor     v9.16b, v5.16b, v9.16b
-       add     v4.4s, v11.4s, v4.4s
-       orr     v8.16b, v8.16b, v15.16b
-       ushr    v15.4s, v9.4s, #7
-       shl     v9.4s, v9.4s, #25
-       eor     v31.16b, v4.16b, v31.16b
-       add     v12.4s, v0.4s, v12.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v31.4s, #7
-       shl     v31.4s, v31.4s, #25
-       eor     v13.16b, v12.16b, v13.16b
-       add     v14.4s, v14.4s, v23.4s
-       orr     v31.16b, v31.16b, v15.16b
-       ushr    v15.4s, v13.4s, #7
-       shl     v13.4s, v13.4s, #25
-       add     v14.4s, v14.4s, v9.4s
-       add     v2.4s, v2.4s, v29.4s
-       orr     v13.16b, v13.16b, v15.16b
-       eor     v0.16b, v14.16b, v0.16b
-       add     v2.4s, v2.4s, v31.4s
-       add     v3.4s, v3.4s, v30.4s
-       rev32   v0.8h, v0.8h
-       eor     v16.16b, v2.16b, v16.16b
-       add     v3.4s, v3.4s, v13.4s
-       add     v17.4s, v17.4s, v26.4s
-       add     v4.4s, v4.4s, v0.4s
-       rev32   v16.8h, v16.8h
-       eor     v10.16b, v3.16b, v10.16b
-       add     v17.4s, v17.4s, v8.4s
-       ldur    q22, [x29, #-128]
-       eor     v9.16b, v4.16b, v9.16b
-       add     v12.4s, v12.4s, v16.4s
-       rev32   v10.8h, v10.8h
-       eor     v11.16b, v17.16b, v11.16b
-       ushr    v15.4s, v9.4s, #12
-       shl     v9.4s, v9.4s, #20
-       eor     v31.16b, v12.16b, v31.16b
-       add     v1.4s, v1.4s, v10.4s
-       rev32   v11.8h, v11.8h
-       ldr     q26, [sp, #176]
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v31.4s, #12
-       shl     v31.4s, v31.4s, #20
-       eor     v13.16b, v1.16b, v13.16b
-       add     v5.4s, v5.4s, v11.4s
-       add     v14.4s, v14.4s, v24.4s
-       orr     v31.16b, v31.16b, v15.16b
-       ushr    v15.4s, v13.4s, #12
-       shl     v13.4s, v13.4s, #20
-       eor     v8.16b, v5.16b, v8.16b
-       add     v14.4s, v14.4s, v9.4s
-       add     v2.4s, v2.4s, v22.4s
-       orr     v13.16b, v13.16b, v15.16b
-       ushr    v15.4s, v8.4s, #12
-       shl     v8.4s, v8.4s, #20
-       eor     v0.16b, v14.16b, v0.16b
-       add     v2.4s, v2.4s, v31.4s
-       add     v3.4s, v3.4s, v28.4s
-       orr     v8.16b, v8.16b, v15.16b
-       ushr    v15.4s, v0.4s, #8
-       shl     v0.4s, v0.4s, #24
-       eor     v16.16b, v2.16b, v16.16b
-       add     v3.4s, v3.4s, v13.4s
+       ushr    v19.4s, v17.4s, #12
+       shl     v17.4s, v17.4s, #20
+       add     v30.4s, v7.4s, v2.4s
+       eor     v18.16b, v29.16b, v23.16b
+       orr     v12.16b, v17.16b, v19.16b
+       eor     v17.16b, v30.16b, v13.16b
+       eor     v19.16b, v24.16b, v25.16b
+       ushr    v23.4s, v18.4s, #12
+       shl     v18.4s, v18.4s, #20
+       ushr    v25.4s, v17.4s, #12
+       shl     v17.4s, v17.4s, #20
+       ushr    v28.4s, v19.4s, #12
+       shl     v19.4s, v19.4s, #20
+       orr     v13.16b, v18.16b, v23.16b
+       orr     v25.16b, v17.16b, v25.16b
+       orr     v2.16b, v19.16b, v28.16b
+       add     v28.4s, v0.4s, v12.4s
+       add     v0.4s, v3.4s, v26.4s
+       add     v18.4s, v1.4s, v13.4s
+       add     v3.4s, v5.4s, v16.4s
+       eor     v1.16b, v28.16b, v4.16b
+       add     v17.4s, v0.4s, v25.4s
+       eor     v0.16b, v18.16b, v6.16b
+       add     v19.4s, v3.4s, v2.4s
+       ushr    v16.4s, v1.4s, #8
+       shl     v3.4s, v1.4s, #24
+       eor     v4.16b, v17.16b, v7.16b
+       ushr    v6.4s, v0.4s, #8
+       shl     v1.4s, v0.4s, #24
+       eor     v5.16b, v19.16b, v31.16b
+       ushr    v23.4s, v4.4s, #8
+       shl     v4.4s, v4.4s, #24
+       orr     v7.16b, v3.16b, v16.16b
+       orr     v6.16b, v1.16b, v6.16b
+       ushr    v31.4s, v5.4s, #8
+       shl     v0.4s, v5.4s, #24
+       orr     v5.16b, v4.16b, v23.16b
+       add     v4.4s, v7.4s, v21.4s
+       ldr     q21, [sp, #192]
+       add     v3.4s, v6.4s, v29.4s
+       orr     v31.16b, v0.16b, v31.16b
+       add     v23.4s, v5.4s, v30.4s
+       eor     v0.16b, v4.16b, v12.16b
+       eor     v1.16b, v3.16b, v13.16b
+       add     v16.4s, v31.4s, v24.4s
+       eor     v20.16b, v23.16b, v25.16b
+       ushr    v24.4s, v0.4s, #7
+       shl     v0.4s, v0.4s, #25
+       ushr    v29.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       ushr    v30.4s, v20.4s, #7
+       shl     v20.4s, v20.4s, #25
+       orr     v25.16b, v0.16b, v24.16b
+       orr     v0.16b, v1.16b, v29.16b
+       mov     v29.16b, v10.16b
+       orr     v1.16b, v20.16b, v30.16b
+       mov     v20.16b, v10.16b
+       mov     v24.16b, v21.16b
+       ldr     q20, [sp, #224]
+       mov     v29.d[1], v8.d[0]
+       mov     v13.16b, v9.16b
+       zip2    v30.2d, v10.2d, v8.2d
+       zip2    v8.2d, v21.2d, v20.2d
+       mov     v26.16b, v11.16b
+       mov     v24.d[1], v20.d[0]
+       add     v20.4s, v28.4s, v29.4s
+       mov     v13.d[1], v14.d[0]
+       str     q8, [sp, #128]
+       eor     v2.16b, v16.16b, v2.16b
+       mov     v26.d[1], v15.d[0]
+       str     q24, [sp, #192]
+       add     v20.4s, v20.4s, v0.4s
+       add     v19.4s, v19.4s, v13.4s
+       ushr    v12.4s, v2.4s, #7
+       shl     v2.4s, v2.4s, #25
+       zip2    v10.2d, v9.2d, v14.2d
+       add     v18.4s, v18.4s, v24.4s
        add     v17.4s, v17.4s, v26.4s
-       orr     v0.16b, v0.16b, v15.16b
-       ushr    v15.4s, v16.4s, #8
-       shl     v16.4s, v16.4s, #24
-       eor     v10.16b, v3.16b, v10.16b
-       add     v17.4s, v17.4s, v8.4s
-       orr     v16.16b, v16.16b, v15.16b
-       ushr    v15.4s, v10.4s, #8
-       shl     v10.4s, v10.4s, #24
-       eor     v11.16b, v17.16b, v11.16b
-       add     v4.4s, v0.4s, v4.4s
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v11.4s, #8
-       shl     v11.4s, v11.4s, #24
-       eor     v9.16b, v4.16b, v9.16b
-       add     v12.4s, v16.4s, v12.4s
-       orr     v11.16b, v11.16b, v15.16b
-       ushr    v15.4s, v9.4s, #7
-       shl     v9.4s, v9.4s, #25
-       eor     v31.16b, v12.16b, v31.16b
-       add     v1.4s, v10.4s, v1.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v31.4s, #7
-       shl     v31.4s, v31.4s, #25
-       eor     v13.16b, v1.16b, v13.16b
-       add     v5.4s, v11.4s, v5.4s
-       orr     v31.16b, v31.16b, v15.16b
-       ushr    v15.4s, v13.4s, #7
-       shl     v13.4s, v13.4s, #25
-       eor     v8.16b, v5.16b, v8.16b
-       orr     v13.16b, v13.16b, v15.16b
-       ushr    v15.4s, v8.4s, #7
-       shl     v8.4s, v8.4s, #25
-       orr     v8.16b, v8.16b, v15.16b
-       add     v14.4s, v14.4s, v18.4s
-       add     v14.4s, v14.4s, v8.4s
-       add     v2.4s, v2.4s, v27.4s
-       eor     v16.16b, v14.16b, v16.16b
-       add     v2.4s, v2.4s, v9.4s
-       add     v3.4s, v3.4s, v7.4s
-       rev32   v16.8h, v16.8h
-       eor     v10.16b, v2.16b, v10.16b
-       add     v3.4s, v3.4s, v31.4s
+       mov     v14.16b, v26.16b
+       eor     v26.16b, v20.16b, v31.16b
+       stp     q10, q30, [sp, #224]
+       add     v19.4s, v19.4s, v25.4s
+       orr     v2.16b, v2.16b, v12.16b
+       add     v18.4s, v18.4s, v1.4s
+       rev32   v26.8h, v26.8h
+       eor     v5.16b, v19.16b, v5.16b
+       add     v17.4s, v17.4s, v2.4s
+       eor     v7.16b, v18.16b, v7.16b
+       add     v23.4s, v23.4s, v26.4s
+       rev32   v5.8h, v5.8h
+       eor     v6.16b, v17.16b, v6.16b
+       rev32   v7.8h, v7.8h
+       eor     v0.16b, v23.16b, v0.16b
+       add     v3.4s, v3.4s, v5.4s
+       rev32   v6.8h, v6.8h
+       add     v16.4s, v16.4s, v7.4s
+       ushr    v31.4s, v0.4s, #12
+       shl     v0.4s, v0.4s, #20
+       eor     v25.16b, v3.16b, v25.16b
+       add     v4.4s, v4.4s, v6.4s
+       eor     v1.16b, v16.16b, v1.16b
+       orr     v0.16b, v0.16b, v31.16b
+       ushr    v31.4s, v25.4s, #12
+       shl     v25.4s, v25.4s, #20
+       add     v20.4s, v20.4s, v30.4s
+       zip2    v21.2d, v11.2d, v15.2d
+       ushr    v11.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
+       eor     v2.16b, v4.16b, v2.16b
+       orr     v25.16b, v25.16b, v31.16b
+       add     v19.4s, v19.4s, v10.4s
+       add     v20.4s, v20.4s, v0.4s
+       orr     v1.16b, v1.16b, v11.16b
+       ushr    v11.4s, v2.4s, #12
+       shl     v2.4s, v2.4s, #20
+       add     v18.4s, v18.4s, v8.4s
+       add     v19.4s, v19.4s, v25.4s
+       eor     v26.16b, v20.16b, v26.16b
+       orr     v2.16b, v2.16b, v11.16b
        add     v17.4s, v17.4s, v21.4s
-       add     v1.4s, v1.4s, v16.4s
-       rev32   v10.8h, v10.8h
-       eor     v11.16b, v3.16b, v11.16b
-       add     v17.4s, v17.4s, v13.4s
-       eor     v8.16b, v1.16b, v8.16b
-       add     v5.4s, v5.4s, v10.4s
-       rev32   v11.8h, v11.8h
-       eor     v0.16b, v17.16b, v0.16b
-       add     v14.4s, v14.4s, v6.4s
-       ldur    q6, [x29, #-96]
-       ushr    v15.4s, v8.4s, #12
-       shl     v8.4s, v8.4s, #20
-       eor     v9.16b, v5.16b, v9.16b
-       add     v4.4s, v4.4s, v11.4s
-       rev32   v0.8h, v0.8h
-       stur    q20, [x29, #-160]
-       mov     v20.16b, v29.16b
-       orr     v8.16b, v8.16b, v15.16b
-       ushr    v15.4s, v9.4s, #12
-       shl     v9.4s, v9.4s, #20
-       eor     v31.16b, v4.16b, v31.16b
-       add     v12.4s, v12.4s, v0.4s
-       mov     v19.16b, v29.16b
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v31.4s, #12
-       shl     v31.4s, v31.4s, #20
-       eor     v13.16b, v12.16b, v13.16b
-       add     v14.4s, v14.4s, v8.4s
-       add     v2.4s, v2.4s, v20.4s
-       mov     v19.16b, v28.16b
-       orr     v31.16b, v31.16b, v15.16b
-       ushr    v15.4s, v13.4s, #12
-       shl     v13.4s, v13.4s, #20
-       eor     v16.16b, v14.16b, v16.16b
-       add     v2.4s, v2.4s, v9.4s
+       add     v18.4s, v18.4s, v1.4s
+       eor     v5.16b, v19.16b, v5.16b
+       ushr    v31.4s, v26.4s, #8
+       shl     v26.4s, v26.4s, #24
+       add     v17.4s, v17.4s, v2.4s
+       ushr    v11.4s, v5.4s, #8
+       shl     v5.4s, v5.4s, #24
+       eor     v7.16b, v18.16b, v7.16b
+       orr     v26.16b, v26.16b, v31.16b
+       eor     v6.16b, v17.16b, v6.16b
+       orr     v5.16b, v5.16b, v11.16b
+       ushr    v31.4s, v7.4s, #8
+       shl     v7.4s, v7.4s, #24
+       add     v23.4s, v26.4s, v23.4s
+       ushr    v11.4s, v6.4s, #8
+       shl     v6.4s, v6.4s, #24
+       orr     v7.16b, v7.16b, v31.16b
+       add     v3.4s, v5.4s, v3.4s
+       eor     v0.16b, v23.16b, v0.16b
+       ldp     q28, q12, [x29, #-256]
+       orr     v6.16b, v6.16b, v11.16b
+       add     v16.4s, v7.4s, v16.4s
+       eor     v25.16b, v3.16b, v25.16b
+       ushr    v31.4s, v0.4s, #7
+       shl     v0.4s, v0.4s, #25
+       add     v4.4s, v6.4s, v4.4s
+       ushr    v11.4s, v25.4s, #7
+       shl     v25.4s, v25.4s, #25
+       eor     v1.16b, v16.16b, v1.16b
+       orr     v0.16b, v0.16b, v31.16b
+       add     v18.4s, v18.4s, v12.4s
+       mov     v15.16b, v29.16b
+       ldur    q29, [x29, #-208]
+       eor     v2.16b, v4.16b, v2.16b
+       orr     v25.16b, v25.16b, v11.16b
+       ushr    v31.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       str     q15, [sp, #160]
+       add     v20.4s, v20.4s, v29.4s
+       add     v18.4s, v18.4s, v0.4s
+       ushr    v11.4s, v2.4s, #7
+       shl     v2.4s, v2.4s, #25
+       orr     v1.16b, v1.16b, v31.16b
+       add     v20.4s, v20.4s, v25.4s
+       add     v17.4s, v17.4s, v27.4s
+       eor     v6.16b, v6.16b, v18.16b
+       orr     v2.16b, v2.16b, v11.16b
+       add     v19.4s, v19.4s, v28.4s
+       eor     v7.16b, v7.16b, v20.16b
+       add     v17.4s, v17.4s, v1.4s
+       rev32   v6.8h, v6.8h
+       add     v19.4s, v19.4s, v2.4s
+       rev32   v7.8h, v7.8h
+       eor     v5.16b, v17.16b, v5.16b
        add     v3.4s, v3.4s, v6.4s
-       orr     v13.16b, v13.16b, v15.16b
-       ushr    v15.4s, v16.4s, #8
-       shl     v16.4s, v16.4s, #24
-       eor     v10.16b, v2.16b, v10.16b
-       add     v3.4s, v3.4s, v31.4s
-       add     v17.4s, v17.4s, v19.4s
-       orr     v16.16b, v16.16b, v15.16b
-       ushr    v15.4s, v10.4s, #8
-       shl     v10.4s, v10.4s, #24
-       eor     v11.16b, v3.16b, v11.16b
+       eor     v26.16b, v19.16b, v26.16b
+       add     v4.4s, v4.4s, v7.4s
+       rev32   v5.8h, v5.8h
+       eor     v0.16b, v3.16b, v0.16b
+       rev32   v26.8h, v26.8h
+       eor     v25.16b, v4.16b, v25.16b
+       add     v23.4s, v23.4s, v5.4s
+       ushr    v11.4s, v0.4s, #12
+       shl     v0.4s, v0.4s, #20
+       add     v16.4s, v16.4s, v26.4s
+       ushr    v31.4s, v25.4s, #12
+       shl     v25.4s, v25.4s, #20
+       eor     v1.16b, v23.16b, v1.16b
+       orr     v0.16b, v0.16b, v11.16b
+       add     v18.4s, v18.4s, v24.4s
+       orr     v25.16b, v25.16b, v31.16b
+       eor     v2.16b, v16.16b, v2.16b
+       ushr    v31.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
+       add     v20.4s, v20.4s, v22.4s
+       add     v18.4s, v18.4s, v0.4s
+       mov     v9.16b, v30.16b
+       mov     v30.16b, v21.16b
+       ldur    q21, [x29, #-224]
+       ushr    v11.4s, v2.4s, #12
+       shl     v2.4s, v2.4s, #20
+       orr     v1.16b, v1.16b, v31.16b
+       add     v20.4s, v20.4s, v25.4s
+       str     q30, [sp, #144]
+       add     v17.4s, v17.4s, v21.4s
+       ldur    q21, [x29, #-192]
+       eor     v6.16b, v18.16b, v6.16b
+       orr     v2.16b, v2.16b, v11.16b
+       add     v19.4s, v19.4s, v30.4s
+       eor     v7.16b, v20.16b, v7.16b
+       add     v17.4s, v17.4s, v1.4s
+       ushr    v11.4s, v6.4s, #8
+       shl     v6.4s, v6.4s, #24
+       add     v19.4s, v19.4s, v2.4s
+       ushr    v31.4s, v7.4s, #8
+       shl     v7.4s, v7.4s, #24
+       eor     v5.16b, v17.16b, v5.16b
+       orr     v6.16b, v6.16b, v11.16b
+       eor     v26.16b, v19.16b, v26.16b
+       orr     v7.16b, v7.16b, v31.16b
+       ushr    v31.4s, v5.4s, #8
+       shl     v5.4s, v5.4s, #24
+       add     v3.4s, v6.4s, v3.4s
+       ushr    v11.4s, v26.4s, #8
+       shl     v26.4s, v26.4s, #24
+       add     v4.4s, v7.4s, v4.4s
+       orr     v5.16b, v5.16b, v31.16b
+       eor     v0.16b, v3.16b, v0.16b
+       orr     v26.16b, v26.16b, v11.16b
+       eor     v25.16b, v4.16b, v25.16b
+       add     v23.4s, v5.4s, v23.4s
+       ushr    v11.4s, v0.4s, #7
+       shl     v0.4s, v0.4s, #25
+       add     v16.4s, v26.4s, v16.4s
+       ushr    v31.4s, v25.4s, #7
+       shl     v25.4s, v25.4s, #25
+       eor     v1.16b, v23.16b, v1.16b
+       orr     v0.16b, v0.16b, v11.16b
+       add     v20.4s, v20.4s, v21.4s
+       orr     v25.16b, v25.16b, v31.16b
+       eor     v2.16b, v16.16b, v2.16b
+       ushr    v31.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       add     v20.4s, v20.4s, v0.4s
+       add     v19.4s, v19.4s, v10.4s
+       ushr    v11.4s, v2.4s, #7
+       shl     v2.4s, v2.4s, #25
+       orr     v1.16b, v1.16b, v31.16b
+       add     v18.4s, v18.4s, v14.4s
+       eor     v26.16b, v20.16b, v26.16b
+       add     v19.4s, v19.4s, v25.4s
+       orr     v2.16b, v2.16b, v11.16b
+       add     v17.4s, v17.4s, v9.4s
+       ldr     q9, [sp, #208]
+       add     v18.4s, v18.4s, v1.4s
+       rev32   v26.8h, v26.8h
+       eor     v5.16b, v19.16b, v5.16b
+       add     v17.4s, v17.4s, v2.4s
+       eor     v7.16b, v18.16b, v7.16b
+       add     v23.4s, v23.4s, v26.4s
+       rev32   v5.8h, v5.8h
+       eor     v6.16b, v17.16b, v6.16b
+       rev32   v7.8h, v7.8h
+       eor     v0.16b, v23.16b, v0.16b
+       add     v3.4s, v3.4s, v5.4s
+       rev32   v6.8h, v6.8h
+       add     v16.4s, v16.4s, v7.4s
+       ushr    v31.4s, v0.4s, #12
+       shl     v0.4s, v0.4s, #20
+       eor     v25.16b, v3.16b, v25.16b
+       add     v4.4s, v4.4s, v6.4s
+       eor     v1.16b, v16.16b, v1.16b
+       orr     v0.16b, v0.16b, v31.16b
+       ushr    v31.4s, v25.4s, #12
+       shl     v25.4s, v25.4s, #20
+       add     v20.4s, v20.4s, v8.4s
+       ushr    v11.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
+       eor     v2.16b, v4.16b, v2.16b
+       orr     v25.16b, v25.16b, v31.16b
+       add     v19.4s, v19.4s, v15.4s
+       add     v20.4s, v20.4s, v0.4s
+       orr     v1.16b, v1.16b, v11.16b
+       ushr    v11.4s, v2.4s, #12
+       shl     v2.4s, v2.4s, #20
+       add     v18.4s, v18.4s, v9.4s
+       add     v19.4s, v19.4s, v25.4s
+       eor     v26.16b, v20.16b, v26.16b
+       orr     v2.16b, v2.16b, v11.16b
        add     v17.4s, v17.4s, v13.4s
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v11.4s, #8
-       shl     v11.4s, v11.4s, #24
-       eor     v0.16b, v17.16b, v0.16b
-       add     v1.4s, v16.4s, v1.4s
-       orr     v11.16b, v11.16b, v15.16b
-       ushr    v15.4s, v0.4s, #8
-       shl     v0.4s, v0.4s, #24
-       eor     v8.16b, v1.16b, v8.16b
-       add     v5.4s, v10.4s, v5.4s
-       orr     v0.16b, v0.16b, v15.16b
-       ushr    v15.4s, v8.4s, #7
-       shl     v8.4s, v8.4s, #25
-       eor     v9.16b, v5.16b, v9.16b
-       add     v4.4s, v11.4s, v4.4s
-       orr     v8.16b, v8.16b, v15.16b
-       ushr    v15.4s, v9.4s, #7
-       shl     v9.4s, v9.4s, #25
-       eor     v31.16b, v4.16b, v31.16b
-       add     v12.4s, v0.4s, v12.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v31.4s, #7
-       shl     v31.4s, v31.4s, #25
-       eor     v13.16b, v12.16b, v13.16b
-       add     v14.4s, v14.4s, v25.4s
-       orr     v31.16b, v31.16b, v15.16b
-       ushr    v15.4s, v13.4s, #7
-       shl     v13.4s, v13.4s, #25
-       add     v14.4s, v14.4s, v9.4s
-       add     v2.4s, v2.4s, v30.4s
-       orr     v13.16b, v13.16b, v15.16b
-       eor     v0.16b, v14.16b, v0.16b
-       add     v2.4s, v2.4s, v31.4s
-       add     v3.4s, v3.4s, v24.4s
-       rev32   v0.8h, v0.8h
-       eor     v16.16b, v2.16b, v16.16b
-       add     v3.4s, v3.4s, v13.4s
-       add     v17.4s, v17.4s, v26.4s
-       mov     v29.16b, v27.16b
-       add     v4.4s, v4.4s, v0.4s
-       rev32   v16.8h, v16.8h
-       eor     v10.16b, v3.16b, v10.16b
-       add     v17.4s, v17.4s, v8.4s
-       ldur    q27, [x29, #-160]
-       eor     v9.16b, v4.16b, v9.16b
-       add     v12.4s, v12.4s, v16.4s
-       rev32   v10.8h, v10.8h
-       eor     v11.16b, v17.16b, v11.16b
-       ldur    q6, [x29, #-80]
-       ushr    v15.4s, v9.4s, #12
-       shl     v9.4s, v9.4s, #20
-       eor     v31.16b, v12.16b, v31.16b
-       add     v1.4s, v1.4s, v10.4s
-       rev32   v11.8h, v11.8h
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v31.4s, #12
-       shl     v31.4s, v31.4s, #20
-       eor     v13.16b, v1.16b, v13.16b
-       add     v5.4s, v5.4s, v11.4s
-       add     v14.4s, v14.4s, v22.4s
-       orr     v31.16b, v31.16b, v15.16b
-       ushr    v15.4s, v13.4s, #12
-       shl     v13.4s, v13.4s, #20
-       eor     v8.16b, v5.16b, v8.16b
-       add     v14.4s, v14.4s, v9.4s
-       add     v2.4s, v2.4s, v27.4s
-       orr     v13.16b, v13.16b, v15.16b
-       ushr    v15.4s, v8.4s, #12
-       shl     v8.4s, v8.4s, #20
-       eor     v0.16b, v14.16b, v0.16b
-       add     v2.4s, v2.4s, v31.4s
+       add     v18.4s, v18.4s, v1.4s
+       eor     v5.16b, v19.16b, v5.16b
+       ushr    v31.4s, v26.4s, #8
+       shl     v26.4s, v26.4s, #24
+       add     v17.4s, v17.4s, v2.4s
+       ushr    v11.4s, v5.4s, #8
+       shl     v5.4s, v5.4s, #24
+       eor     v7.16b, v18.16b, v7.16b
+       orr     v26.16b, v26.16b, v31.16b
+       eor     v6.16b, v17.16b, v6.16b
+       orr     v5.16b, v5.16b, v11.16b
+       ushr    v31.4s, v7.4s, #8
+       shl     v7.4s, v7.4s, #24
+       add     v23.4s, v26.4s, v23.4s
+       ushr    v11.4s, v6.4s, #8
+       shl     v6.4s, v6.4s, #24
+       orr     v7.16b, v7.16b, v31.16b
+       add     v3.4s, v5.4s, v3.4s
+       eor     v0.16b, v23.16b, v0.16b
+       orr     v6.16b, v6.16b, v11.16b
+       add     v16.4s, v7.4s, v16.4s
+       eor     v25.16b, v3.16b, v25.16b
+       ushr    v31.4s, v0.4s, #7
+       shl     v0.4s, v0.4s, #25
+       add     v4.4s, v6.4s, v4.4s
+       ushr    v11.4s, v25.4s, #7
+       shl     v25.4s, v25.4s, #25
+       eor     v1.16b, v16.16b, v1.16b
+       orr     v0.16b, v0.16b, v31.16b
+       add     v18.4s, v18.4s, v24.4s
+       eor     v2.16b, v4.16b, v2.16b
+       orr     v25.16b, v25.16b, v11.16b
+       ushr    v31.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       add     v20.4s, v20.4s, v12.4s
+       add     v18.4s, v18.4s, v0.4s
+       ushr    v11.4s, v2.4s, #7
+       shl     v2.4s, v2.4s, #25
+       orr     v1.16b, v1.16b, v31.16b
+       add     v20.4s, v20.4s, v25.4s
+       add     v17.4s, v17.4s, v30.4s
+       eor     v6.16b, v6.16b, v18.16b
+       orr     v2.16b, v2.16b, v11.16b
+       add     v19.4s, v19.4s, v27.4s
+       eor     v7.16b, v7.16b, v20.16b
+       add     v17.4s, v17.4s, v1.4s
+       rev32   v6.8h, v6.8h
+       add     v19.4s, v19.4s, v2.4s
+       rev32   v7.8h, v7.8h
+       eor     v5.16b, v17.16b, v5.16b
        add     v3.4s, v3.4s, v6.4s
-       orr     v8.16b, v8.16b, v15.16b
-       ushr    v15.4s, v0.4s, #8
-       shl     v0.4s, v0.4s, #24
-       eor     v16.16b, v2.16b, v16.16b
-       add     v3.4s, v3.4s, v13.4s
-       add     v17.4s, v17.4s, v23.4s
-       orr     v0.16b, v0.16b, v15.16b
-       ushr    v15.4s, v16.4s, #8
-       shl     v16.4s, v16.4s, #24
-       eor     v10.16b, v3.16b, v10.16b
-       add     v17.4s, v17.4s, v8.4s
-       orr     v16.16b, v16.16b, v15.16b
-       ushr    v15.4s, v10.4s, #8
-       shl     v10.4s, v10.4s, #24
-       eor     v11.16b, v17.16b, v11.16b
-       add     v4.4s, v0.4s, v4.4s
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v11.4s, #8
-       shl     v11.4s, v11.4s, #24
-       eor     v9.16b, v4.16b, v9.16b
-       add     v12.4s, v16.4s, v12.4s
-       orr     v11.16b, v11.16b, v15.16b
-       ushr    v15.4s, v9.4s, #7
-       shl     v9.4s, v9.4s, #25
-       eor     v31.16b, v12.16b, v31.16b
-       add     v1.4s, v10.4s, v1.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v31.4s, #7
-       shl     v31.4s, v31.4s, #25
-       eor     v13.16b, v1.16b, v13.16b
-       add     v5.4s, v11.4s, v5.4s
-       orr     v31.16b, v31.16b, v15.16b
-       ushr    v15.4s, v13.4s, #7
-       shl     v13.4s, v13.4s, #25
-       eor     v8.16b, v5.16b, v8.16b
-       orr     v13.16b, v13.16b, v15.16b
-       ushr    v15.4s, v8.4s, #7
-       shl     v8.4s, v8.4s, #25
-       orr     v8.16b, v8.16b, v15.16b
-       add     v14.4s, v14.4s, v29.4s
-       add     v14.4s, v14.4s, v8.4s
-       add     v2.4s, v2.4s, v20.4s
-       mov     v28.16b, v7.16b
-       eor     v16.16b, v14.16b, v16.16b
-       add     v2.4s, v2.4s, v9.4s
-       add     v3.4s, v3.4s, v19.4s
-       rev32   v16.8h, v16.8h
-       eor     v10.16b, v2.16b, v10.16b
-       add     v3.4s, v3.4s, v31.4s
-       add     v17.4s, v17.4s, v28.4s
-       add     v1.4s, v1.4s, v16.4s
-       rev32   v10.8h, v10.8h
-       eor     v11.16b, v3.16b, v11.16b
+       eor     v26.16b, v19.16b, v26.16b
+       add     v4.4s, v4.4s, v7.4s
+       rev32   v5.8h, v5.8h
+       eor     v0.16b, v3.16b, v0.16b
+       rev32   v26.8h, v26.8h
+       eor     v25.16b, v4.16b, v25.16b
+       add     v23.4s, v23.4s, v5.4s
+       ushr    v11.4s, v0.4s, #12
+       shl     v0.4s, v0.4s, #20
+       add     v16.4s, v16.4s, v26.4s
+       ushr    v31.4s, v25.4s, #12
+       shl     v25.4s, v25.4s, #20
+       eor     v1.16b, v23.16b, v1.16b
+       orr     v0.16b, v0.16b, v11.16b
+       add     v18.4s, v18.4s, v14.4s
+       orr     v25.16b, v25.16b, v31.16b
+       eor     v2.16b, v16.16b, v2.16b
+       ushr    v31.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
+       add     v20.4s, v20.4s, v28.4s
+       add     v18.4s, v18.4s, v0.4s
+       mov     v10.16b, v13.16b
+       ushr    v11.4s, v2.4s, #12
+       shl     v2.4s, v2.4s, #20
+       orr     v1.16b, v1.16b, v31.16b
+       add     v20.4s, v20.4s, v25.4s
+       add     v17.4s, v17.4s, v29.4s
+       eor     v6.16b, v18.16b, v6.16b
+       orr     v2.16b, v2.16b, v11.16b
+       add     v19.4s, v19.4s, v10.4s
+       eor     v7.16b, v20.16b, v7.16b
+       add     v17.4s, v17.4s, v1.4s
+       ushr    v11.4s, v6.4s, #8
+       shl     v6.4s, v6.4s, #24
+       add     v19.4s, v19.4s, v2.4s
+       ushr    v31.4s, v7.4s, #8
+       shl     v7.4s, v7.4s, #24
+       eor     v5.16b, v17.16b, v5.16b
+       orr     v6.16b, v6.16b, v11.16b
+       eor     v26.16b, v19.16b, v26.16b
+       orr     v7.16b, v7.16b, v31.16b
+       ushr    v31.4s, v5.4s, #8
+       shl     v5.4s, v5.4s, #24
+       add     v3.4s, v6.4s, v3.4s
+       ushr    v11.4s, v26.4s, #8
+       shl     v26.4s, v26.4s, #24
+       add     v4.4s, v7.4s, v4.4s
+       orr     v5.16b, v5.16b, v31.16b
+       eor     v0.16b, v3.16b, v0.16b
+       mov     v22.16b, v8.16b
+       ldp     q8, q28, [sp, #240]
+       orr     v26.16b, v26.16b, v11.16b
+       eor     v25.16b, v4.16b, v25.16b
+       add     v23.4s, v5.4s, v23.4s
+       ushr    v11.4s, v0.4s, #7
+       shl     v0.4s, v0.4s, #25
+       add     v16.4s, v26.4s, v16.4s
+       ushr    v31.4s, v25.4s, #7
+       shl     v25.4s, v25.4s, #25
+       eor     v1.16b, v23.16b, v1.16b
+       orr     v0.16b, v0.16b, v11.16b
+       add     v20.4s, v20.4s, v28.4s
+       orr     v25.16b, v25.16b, v31.16b
+       eor     v2.16b, v16.16b, v2.16b
+       ushr    v31.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       add     v20.4s, v20.4s, v0.4s
+       add     v19.4s, v19.4s, v15.4s
+       ushr    v11.4s, v2.4s, #7
+       shl     v2.4s, v2.4s, #25
+       orr     v1.16b, v1.16b, v31.16b
+       add     v18.4s, v18.4s, v8.4s
+       eor     v26.16b, v20.16b, v26.16b
+       add     v19.4s, v19.4s, v25.4s
+       orr     v2.16b, v2.16b, v11.16b
+       add     v17.4s, v17.4s, v22.4s
+       ldur    q22, [x29, #-256]
+       add     v18.4s, v18.4s, v1.4s
+       rev32   v26.8h, v26.8h
+       eor     v5.16b, v19.16b, v5.16b
+       add     v17.4s, v17.4s, v2.4s
+       eor     v7.16b, v18.16b, v7.16b
+       add     v23.4s, v23.4s, v26.4s
+       rev32   v5.8h, v5.8h
+       eor     v6.16b, v17.16b, v6.16b
+       rev32   v7.8h, v7.8h
+       eor     v0.16b, v23.16b, v0.16b
+       add     v3.4s, v3.4s, v5.4s
+       rev32   v6.8h, v6.8h
+       add     v16.4s, v16.4s, v7.4s
+       ushr    v31.4s, v0.4s, #12
+       shl     v0.4s, v0.4s, #20
+       eor     v25.16b, v3.16b, v25.16b
+       add     v4.4s, v4.4s, v6.4s
+       eor     v1.16b, v16.16b, v1.16b
+       orr     v0.16b, v0.16b, v31.16b
+       ushr    v31.4s, v25.4s, #12
+       shl     v25.4s, v25.4s, #20
+       add     v20.4s, v20.4s, v9.4s
+       mov     v13.16b, v12.16b
+       mov     v12.16b, v27.16b
+       mov     v27.16b, v9.16b
+       ldur    q9, [x29, #-192]
+       mov     v21.16b, v15.16b
+       ldr     q15, [sp, #224]
+       ushr    v11.4s, v1.4s, #12
+       ldur    q21, [x29, #-224]
+       shl     v1.4s, v1.4s, #20
+       eor     v2.16b, v4.16b, v2.16b
+       orr     v25.16b, v25.16b, v31.16b
+       add     v19.4s, v19.4s, v9.4s
+       add     v20.4s, v20.4s, v0.4s
+       orr     v1.16b, v1.16b, v11.16b
+       ushr    v11.4s, v2.4s, #12
+       shl     v2.4s, v2.4s, #20
+       add     v18.4s, v18.4s, v21.4s
+       add     v19.4s, v19.4s, v25.4s
+       eor     v26.16b, v20.16b, v26.16b
+       orr     v2.16b, v2.16b, v11.16b
+       add     v17.4s, v17.4s, v15.4s
+       add     v18.4s, v18.4s, v1.4s
+       eor     v5.16b, v19.16b, v5.16b
+       ushr    v31.4s, v26.4s, #8
+       shl     v26.4s, v26.4s, #24
+       add     v17.4s, v17.4s, v2.4s
+       ushr    v11.4s, v5.4s, #8
+       shl     v5.4s, v5.4s, #24
+       eor     v7.16b, v18.16b, v7.16b
+       orr     v26.16b, v26.16b, v31.16b
+       eor     v6.16b, v17.16b, v6.16b
+       orr     v5.16b, v5.16b, v11.16b
+       ushr    v31.4s, v7.4s, #8
+       shl     v7.4s, v7.4s, #24
+       add     v23.4s, v26.4s, v23.4s
+       ushr    v11.4s, v6.4s, #8
+       shl     v6.4s, v6.4s, #24
+       orr     v7.16b, v7.16b, v31.16b
+       add     v3.4s, v5.4s, v3.4s
+       eor     v0.16b, v23.16b, v0.16b
+       orr     v6.16b, v6.16b, v11.16b
+       add     v16.4s, v7.4s, v16.4s
+       eor     v25.16b, v3.16b, v25.16b
+       ushr    v31.4s, v0.4s, #7
+       shl     v0.4s, v0.4s, #25
+       add     v4.4s, v6.4s, v4.4s
+       ushr    v11.4s, v25.4s, #7
+       shl     v25.4s, v25.4s, #25
+       eor     v1.16b, v16.16b, v1.16b
+       orr     v0.16b, v0.16b, v31.16b
+       add     v18.4s, v18.4s, v14.4s
+       eor     v2.16b, v4.16b, v2.16b
+       orr     v25.16b, v25.16b, v11.16b
+       ushr    v31.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       add     v20.4s, v20.4s, v24.4s
+       add     v18.4s, v18.4s, v0.4s
+       ushr    v11.4s, v2.4s, #7
+       shl     v2.4s, v2.4s, #25
+       orr     v1.16b, v1.16b, v31.16b
+       add     v20.4s, v20.4s, v25.4s
+       add     v17.4s, v17.4s, v10.4s
+       eor     v6.16b, v6.16b, v18.16b
+       orr     v2.16b, v2.16b, v11.16b
+       add     v19.4s, v19.4s, v30.4s
+       eor     v7.16b, v7.16b, v20.16b
+       add     v17.4s, v17.4s, v1.4s
+       rev32   v6.8h, v6.8h
+       add     v19.4s, v19.4s, v2.4s
+       rev32   v7.8h, v7.8h
+       eor     v5.16b, v17.16b, v5.16b
+       add     v3.4s, v3.4s, v6.4s
+       eor     v26.16b, v19.16b, v26.16b
+       add     v4.4s, v4.4s, v7.4s
+       rev32   v5.8h, v5.8h
+       eor     v0.16b, v3.16b, v0.16b
+       rev32   v26.8h, v26.8h
+       eor     v25.16b, v4.16b, v25.16b
+       add     v23.4s, v23.4s, v5.4s
+       ushr    v11.4s, v0.4s, #12
+       shl     v0.4s, v0.4s, #20
+       add     v16.4s, v16.4s, v26.4s
+       ushr    v31.4s, v25.4s, #12
+       shl     v25.4s, v25.4s, #20
+       eor     v1.16b, v23.16b, v1.16b
+       orr     v0.16b, v0.16b, v11.16b
+       add     v18.4s, v18.4s, v8.4s
+       orr     v25.16b, v25.16b, v31.16b
+       eor     v2.16b, v16.16b, v2.16b
+       ushr    v31.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
+       add     v20.4s, v20.4s, v12.4s
+       add     v18.4s, v18.4s, v0.4s
+       ushr    v11.4s, v2.4s, #12
+       shl     v2.4s, v2.4s, #20
+       orr     v1.16b, v1.16b, v31.16b
+       add     v20.4s, v20.4s, v25.4s
        add     v17.4s, v17.4s, v13.4s
-       eor     v8.16b, v1.16b, v8.16b
-       add     v5.4s, v5.4s, v10.4s
-       rev32   v11.8h, v11.8h
-       eor     v0.16b, v17.16b, v0.16b
-       ushr    v15.4s, v8.4s, #12
-       shl     v8.4s, v8.4s, #20
-       eor     v9.16b, v5.16b, v9.16b
-       add     v4.4s, v4.4s, v11.4s
-       rev32   v0.8h, v0.8h
-       orr     v8.16b, v8.16b, v15.16b
-       ushr    v15.4s, v9.4s, #12
-       shl     v9.4s, v9.4s, #20
-       eor     v31.16b, v4.16b, v31.16b
-       add     v12.4s, v12.4s, v0.4s
-       add     v14.4s, v14.4s, v21.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v31.4s, #12
-       shl     v31.4s, v31.4s, #20
-       eor     v13.16b, v12.16b, v13.16b
-       add     v14.4s, v14.4s, v8.4s
-       add     v2.4s, v2.4s, v30.4s
-       orr     v31.16b, v31.16b, v15.16b
-       ushr    v15.4s, v13.4s, #12
-       shl     v13.4s, v13.4s, #20
-       eor     v16.16b, v14.16b, v16.16b
-       add     v2.4s, v2.4s, v9.4s
-       orr     v13.16b, v13.16b, v15.16b
-       ushr    v15.4s, v16.4s, #8
-       shl     v16.4s, v16.4s, #24
-       eor     v10.16b, v2.16b, v10.16b
-       orr     v16.16b, v16.16b, v15.16b
-       ushr    v15.4s, v10.4s, #8
-       shl     v10.4s, v10.4s, #24
-       add     v3.4s, v3.4s, v18.4s
-       orr     v10.16b, v10.16b, v15.16b
-       add     v15.4s, v3.4s, v31.4s
-       eor     v3.16b, v15.16b, v11.16b
-       ushr    v11.4s, v3.4s, #8
-       shl     v3.4s, v3.4s, #24
-       orr     v11.16b, v3.16b, v11.16b
-       add     v3.4s, v17.4s, v6.4s
-       add     v17.4s, v3.4s, v13.4s
-       eor     v0.16b, v17.16b, v0.16b
-       ushr    v3.4s, v0.4s, #8
-       shl     v0.4s, v0.4s, #24
-       add     v1.4s, v16.4s, v1.4s
-       orr     v0.16b, v0.16b, v3.16b
-       eor     v3.16b, v1.16b, v8.16b
-       ushr    v8.4s, v3.4s, #7
-       shl     v3.4s, v3.4s, #25
-       add     v5.4s, v10.4s, v5.4s
-       orr     v8.16b, v3.16b, v8.16b
-       eor     v3.16b, v5.16b, v9.16b
-       add     v4.4s, v11.4s, v4.4s
-       ushr    v9.4s, v3.4s, #7
-       shl     v3.4s, v3.4s, #25
-       eor     v31.16b, v4.16b, v31.16b
-       mov     v7.16b, v23.16b
-       mov     v23.16b, v28.16b
-       mov     v28.16b, v6.16b
-       orr     v3.16b, v3.16b, v9.16b
-       ushr    v9.4s, v31.4s, #7
-       shl     v31.4s, v31.4s, #25
-       ldur    q6, [x29, #-64]
-       orr     v31.16b, v31.16b, v9.16b
-       add     v9.4s, v0.4s, v12.4s
-       eor     v12.16b, v9.16b, v13.16b
-       ushr    v13.4s, v12.4s, #7
-       shl     v12.4s, v12.4s, #25
-       orr     v12.16b, v12.16b, v13.16b
-       add     v13.4s, v14.4s, v6.4s
-       add     v13.4s, v13.4s, v3.4s
-       eor     v0.16b, v13.16b, v0.16b
-       add     v2.4s, v2.4s, v24.4s
-       rev32   v14.8h, v0.8h
-       add     v0.4s, v2.4s, v31.4s
-       add     v6.4s, v4.4s, v14.4s
-       eor     v2.16b, v0.16b, v16.16b
-       eor     v3.16b, v6.16b, v3.16b
-       rev32   v16.8h, v2.8h
-       ushr    v4.4s, v3.4s, #12
-       shl     v3.4s, v3.4s, #20
-       add     v2.4s, v9.4s, v16.4s
-       orr     v4.16b, v3.16b, v4.16b
-       eor     v3.16b, v2.16b, v31.16b
-       ushr    v31.4s, v3.4s, #12
-       shl     v3.4s, v3.4s, #20
-       orr     v3.16b, v3.16b, v31.16b
-       add     v31.4s, v15.4s, v22.4s
-       add     v31.4s, v31.4s, v12.4s
-       add     v17.4s, v17.4s, v7.4s
-       eor     v9.16b, v31.16b, v10.16b
-       add     v17.4s, v17.4s, v8.4s
-       rev32   v9.8h, v9.8h
-       eor     v11.16b, v17.16b, v11.16b
-       add     v1.4s, v1.4s, v9.4s
-       rev32   v11.8h, v11.8h
-       eor     v10.16b, v1.16b, v12.16b
-       add     v5.4s, v5.4s, v11.4s
-       ushr    v12.4s, v10.4s, #12
-       shl     v10.4s, v10.4s, #20
-       eor     v8.16b, v5.16b, v8.16b
-       orr     v10.16b, v10.16b, v12.16b
-       ushr    v12.4s, v8.4s, #12
-       shl     v8.4s, v8.4s, #20
-       orr     v8.16b, v8.16b, v12.16b
-       add     v12.4s, v13.4s, v27.4s
-       add     v12.4s, v12.4s, v4.4s
-       eor     v13.16b, v12.16b, v14.16b
-       ldur    q14, [x29, #-96]
-       mov     v25.16b, v29.16b
-       add     v29.4s, v12.4s, v20.4s
-       add     v20.4s, v31.4s, v26.4s
-       add     v0.4s, v0.4s, v14.4s
-       add     v0.4s, v0.4s, v3.4s
-       eor     v16.16b, v0.16b, v16.16b
-       add     v0.4s, v0.4s, v30.4s
-       ldur    q30, [x29, #-112]
+       ldr     q13, [sp, #160]
+       eor     v6.16b, v18.16b, v6.16b
+       orr     v2.16b, v2.16b, v11.16b
+       add     v19.4s, v19.4s, v15.4s
+       eor     v7.16b, v20.16b, v7.16b
+       add     v17.4s, v17.4s, v1.4s
+       ushr    v11.4s, v6.4s, #8
+       shl     v6.4s, v6.4s, #24
+       add     v19.4s, v19.4s, v2.4s
+       ushr    v31.4s, v7.4s, #8
+       shl     v7.4s, v7.4s, #24
+       eor     v5.16b, v17.16b, v5.16b
+       orr     v6.16b, v6.16b, v11.16b
+       eor     v26.16b, v19.16b, v26.16b
+       orr     v7.16b, v7.16b, v31.16b
+       ushr    v31.4s, v5.4s, #8
+       shl     v5.4s, v5.4s, #24
+       add     v3.4s, v6.4s, v3.4s
+       ushr    v11.4s, v26.4s, #8
+       shl     v26.4s, v26.4s, #24
+       add     v4.4s, v7.4s, v4.4s
+       orr     v5.16b, v5.16b, v31.16b
+       eor     v0.16b, v3.16b, v0.16b
+       orr     v26.16b, v26.16b, v11.16b
+       eor     v25.16b, v4.16b, v25.16b
+       add     v23.4s, v5.4s, v23.4s
+       ushr    v11.4s, v0.4s, #7
+       shl     v0.4s, v0.4s, #25
+       add     v16.4s, v26.4s, v16.4s
+       ushr    v31.4s, v25.4s, #7
+       shl     v25.4s, v25.4s, #25
+       eor     v1.16b, v23.16b, v1.16b
+       orr     v0.16b, v0.16b, v11.16b
+       add     v20.4s, v20.4s, v22.4s
+       orr     v25.16b, v25.16b, v31.16b
+       eor     v2.16b, v16.16b, v2.16b
+       ushr    v31.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       add     v20.4s, v20.4s, v0.4s
+       add     v19.4s, v19.4s, v9.4s
+       mov     v29.16b, v14.16b
+       ldr     q14, [sp, #128]
+       ushr    v11.4s, v2.4s, #7
+       shl     v2.4s, v2.4s, #25
+       orr     v1.16b, v1.16b, v31.16b
+       add     v18.4s, v18.4s, v14.4s
+       eor     v26.16b, v20.16b, v26.16b
+       add     v19.4s, v19.4s, v25.4s
+       orr     v2.16b, v2.16b, v11.16b
+       add     v17.4s, v17.4s, v27.4s
+       add     v18.4s, v18.4s, v1.4s
+       rev32   v26.8h, v26.8h
+       eor     v5.16b, v19.16b, v5.16b
+       add     v17.4s, v17.4s, v2.4s
+       eor     v7.16b, v18.16b, v7.16b
+       add     v23.4s, v23.4s, v26.4s
+       rev32   v5.8h, v5.8h
+       eor     v6.16b, v17.16b, v6.16b
+       rev32   v7.8h, v7.8h
+       eor     v0.16b, v23.16b, v0.16b
+       add     v3.4s, v3.4s, v5.4s
+       rev32   v6.8h, v6.8h
+       add     v16.4s, v16.4s, v7.4s
+       ushr    v31.4s, v0.4s, #12
+       shl     v0.4s, v0.4s, #20
+       eor     v25.16b, v3.16b, v25.16b
+       add     v4.4s, v4.4s, v6.4s
+       eor     v1.16b, v16.16b, v1.16b
+       orr     v0.16b, v0.16b, v31.16b
+       ushr    v31.4s, v25.4s, #12
+       shl     v25.4s, v25.4s, #20
+       add     v20.4s, v20.4s, v21.4s
+       ushr    v11.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
+       eor     v2.16b, v4.16b, v2.16b
+       orr     v25.16b, v25.16b, v31.16b
+       add     v19.4s, v19.4s, v28.4s
+       add     v20.4s, v20.4s, v0.4s
+       mov     v12.16b, v27.16b
+       ldur    q27, [x29, #-208]
+       orr     v1.16b, v1.16b, v11.16b
+       ushr    v11.4s, v2.4s, #12
+       shl     v2.4s, v2.4s, #20
+       add     v18.4s, v18.4s, v27.4s
+       add     v19.4s, v19.4s, v25.4s
+       eor     v26.16b, v20.16b, v26.16b
+       orr     v2.16b, v2.16b, v11.16b
+       add     v17.4s, v17.4s, v13.4s
+       add     v18.4s, v18.4s, v1.4s
+       eor     v5.16b, v19.16b, v5.16b
+       ushr    v31.4s, v26.4s, #8
+       shl     v26.4s, v26.4s, #24
+       add     v17.4s, v17.4s, v2.4s
+       ushr    v11.4s, v5.4s, #8
+       shl     v5.4s, v5.4s, #24
+       eor     v7.16b, v18.16b, v7.16b
+       orr     v26.16b, v26.16b, v31.16b
+       eor     v6.16b, v17.16b, v6.16b
+       orr     v5.16b, v5.16b, v11.16b
+       ushr    v31.4s, v7.4s, #8
+       shl     v7.4s, v7.4s, #24
+       add     v23.4s, v26.4s, v23.4s
+       ushr    v11.4s, v6.4s, #8
+       shl     v6.4s, v6.4s, #24
+       orr     v7.16b, v7.16b, v31.16b
+       add     v3.4s, v5.4s, v3.4s
+       eor     v0.16b, v23.16b, v0.16b
+       orr     v6.16b, v6.16b, v11.16b
+       add     v16.4s, v7.4s, v16.4s
+       eor     v25.16b, v3.16b, v25.16b
+       ushr    v31.4s, v0.4s, #7
+       shl     v0.4s, v0.4s, #25
+       add     v4.4s, v6.4s, v4.4s
+       ushr    v11.4s, v25.4s, #7
+       shl     v25.4s, v25.4s, #25
+       eor     v1.16b, v16.16b, v1.16b
+       orr     v0.16b, v0.16b, v31.16b
+       add     v18.4s, v18.4s, v8.4s
+       eor     v2.16b, v4.16b, v2.16b
+       orr     v25.16b, v25.16b, v11.16b
+       ushr    v31.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       add     v20.4s, v20.4s, v29.4s
+       add     v18.4s, v18.4s, v0.4s
+       ushr    v11.4s, v2.4s, #7
+       shl     v2.4s, v2.4s, #25
+       orr     v1.16b, v1.16b, v31.16b
+       add     v20.4s, v20.4s, v25.4s
+       add     v17.4s, v17.4s, v15.4s
+       eor     v6.16b, v6.16b, v18.16b
+       orr     v2.16b, v2.16b, v11.16b
+       add     v19.4s, v19.4s, v10.4s
+       eor     v7.16b, v7.16b, v20.16b
+       add     v17.4s, v17.4s, v1.4s
+       rev32   v6.8h, v6.8h
+       add     v19.4s, v19.4s, v2.4s
+       rev32   v7.8h, v7.8h
+       eor     v5.16b, v17.16b, v5.16b
+       add     v3.4s, v3.4s, v6.4s
+       eor     v26.16b, v19.16b, v26.16b
+       add     v4.4s, v4.4s, v7.4s
+       rev32   v5.8h, v5.8h
+       eor     v0.16b, v3.16b, v0.16b
+       rev32   v26.8h, v26.8h
+       eor     v25.16b, v4.16b, v25.16b
+       add     v23.4s, v23.4s, v5.4s
+       ushr    v11.4s, v0.4s, #12
+       shl     v0.4s, v0.4s, #20
+       add     v16.4s, v16.4s, v26.4s
+       ushr    v31.4s, v25.4s, #12
+       shl     v25.4s, v25.4s, #20
+       eor     v1.16b, v23.16b, v1.16b
+       orr     v0.16b, v0.16b, v11.16b
+       add     v18.4s, v18.4s, v14.4s
+       mov     v30.16b, v29.16b
+       mov     v29.16b, v15.16b
+       ldr     q15, [sp, #144]
+       orr     v25.16b, v25.16b, v31.16b
+       eor     v2.16b, v16.16b, v2.16b
+       ushr    v31.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
+       add     v20.4s, v20.4s, v15.4s
+       add     v18.4s, v18.4s, v0.4s
+       ushr    v11.4s, v2.4s, #12
+       shl     v2.4s, v2.4s, #20
+       orr     v1.16b, v1.16b, v31.16b
+       add     v20.4s, v20.4s, v25.4s
+       add     v17.4s, v17.4s, v24.4s
+       eor     v6.16b, v18.16b, v6.16b
+       orr     v2.16b, v2.16b, v11.16b
+       add     v19.4s, v19.4s, v13.4s
+       eor     v7.16b, v20.16b, v7.16b
+       add     v17.4s, v17.4s, v1.4s
+       ushr    v11.4s, v6.4s, #8
+       shl     v6.4s, v6.4s, #24
+       add     v19.4s, v19.4s, v2.4s
+       ushr    v31.4s, v7.4s, #8
+       shl     v7.4s, v7.4s, #24
+       eor     v5.16b, v17.16b, v5.16b
+       orr     v6.16b, v6.16b, v11.16b
+       eor     v26.16b, v19.16b, v26.16b
+       orr     v7.16b, v7.16b, v31.16b
+       ushr    v31.4s, v5.4s, #8
+       shl     v5.4s, v5.4s, #24
+       add     v3.4s, v6.4s, v3.4s
+       ushr    v11.4s, v26.4s, #8
+       shl     v26.4s, v26.4s, #24
+       add     v4.4s, v7.4s, v4.4s
+       orr     v5.16b, v5.16b, v31.16b
+       eor     v0.16b, v3.16b, v0.16b
+       orr     v26.16b, v26.16b, v11.16b
+       eor     v25.16b, v4.16b, v25.16b
+       add     v23.4s, v5.4s, v23.4s
+       ushr    v11.4s, v0.4s, #7
+       shl     v0.4s, v0.4s, #25
+       mov     v9.16b, v28.16b
+       mov     v28.16b, v10.16b
+       ldr     q10, [sp, #176]
+       add     v16.4s, v26.4s, v16.4s
+       ushr    v31.4s, v25.4s, #7
+       shl     v25.4s, v25.4s, #25
+       eor     v1.16b, v23.16b, v1.16b
+       orr     v0.16b, v0.16b, v11.16b
        add     v20.4s, v20.4s, v10.4s
-       eor     v31.16b, v20.16b, v9.16b
+       orr     v25.16b, v25.16b, v31.16b
+       eor     v2.16b, v16.16b, v2.16b
+       ushr    v31.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       add     v20.4s, v20.4s, v0.4s
+       add     v19.4s, v19.4s, v9.4s
+       ushr    v11.4s, v2.4s, #7
+       shl     v2.4s, v2.4s, #25
+       orr     v1.16b, v1.16b, v31.16b
+       add     v18.4s, v18.4s, v12.4s
+       eor     v26.16b, v20.16b, v26.16b
+       add     v19.4s, v19.4s, v25.4s
+       orr     v2.16b, v2.16b, v11.16b
+       add     v17.4s, v17.4s, v21.4s
+       add     v18.4s, v18.4s, v1.4s
+       rev32   v26.8h, v26.8h
+       eor     v5.16b, v19.16b, v5.16b
+       add     v17.4s, v17.4s, v2.4s
+       eor     v7.16b, v18.16b, v7.16b
+       add     v23.4s, v23.4s, v26.4s
+       rev32   v5.8h, v5.8h
+       eor     v6.16b, v17.16b, v6.16b
+       rev32   v7.8h, v7.8h
+       eor     v0.16b, v23.16b, v0.16b
+       add     v3.4s, v3.4s, v5.4s
+       rev32   v6.8h, v6.8h
+       add     v16.4s, v16.4s, v7.4s
+       ushr    v31.4s, v0.4s, #12
+       shl     v0.4s, v0.4s, #20
+       eor     v25.16b, v3.16b, v25.16b
+       add     v4.4s, v4.4s, v6.4s
+       eor     v1.16b, v16.16b, v1.16b
+       orr     v0.16b, v0.16b, v31.16b
+       ushr    v31.4s, v25.4s, #12
+       shl     v25.4s, v25.4s, #20
+       ushr    v11.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
+       eor     v2.16b, v4.16b, v2.16b
+       add     v20.4s, v20.4s, v27.4s
+       orr     v25.16b, v25.16b, v31.16b
+       add     v19.4s, v19.4s, v22.4s
+       mov     v9.16b, v22.16b
+       ldur    q22, [x29, #-240]
+       orr     v1.16b, v1.16b, v11.16b
+       ushr    v11.4s, v2.4s, #12
+       shl     v2.4s, v2.4s, #20
+       add     v20.4s, v20.4s, v0.4s
+       add     v18.4s, v18.4s, v22.4s
+       add     v19.4s, v19.4s, v25.4s
+       mov     v24.16b, v21.16b
+       ldur    q21, [x29, #-192]
+       orr     v2.16b, v2.16b, v11.16b
+       eor     v26.16b, v20.16b, v26.16b
+       add     v17.4s, v17.4s, v21.4s
+       add     v18.4s, v18.4s, v1.4s
+       eor     v5.16b, v19.16b, v5.16b
+       ushr    v31.4s, v26.4s, #8
+       add     v17.4s, v17.4s, v2.4s
+       shl     v26.4s, v26.4s, #24
+       ushr    v11.4s, v5.4s, #8
+       shl     v5.4s, v5.4s, #24
+       eor     v7.16b, v18.16b, v7.16b
+       orr     v26.16b, v26.16b, v31.16b
+       eor     v6.16b, v17.16b, v6.16b
+       orr     v5.16b, v5.16b, v11.16b
+       ushr    v31.4s, v7.4s, #8
+       shl     v7.4s, v7.4s, #24
+       ushr    v11.4s, v6.4s, #8
+       shl     v6.4s, v6.4s, #24
+       add     v23.4s, v26.4s, v23.4s
+       orr     v7.16b, v7.16b, v31.16b
+       add     v3.4s, v5.4s, v3.4s
+       orr     v6.16b, v6.16b, v11.16b
+       eor     v0.16b, v23.16b, v0.16b
+       add     v16.4s, v7.4s, v16.4s
+       eor     v25.16b, v3.16b, v25.16b
+       add     v4.4s, v6.4s, v4.4s
+       ushr    v31.4s, v0.4s, #7
+       shl     v0.4s, v0.4s, #25
+       ushr    v11.4s, v25.4s, #7
+       shl     v25.4s, v25.4s, #25
+       eor     v1.16b, v16.16b, v1.16b
+       orr     v0.16b, v0.16b, v31.16b
+       eor     v2.16b, v4.16b, v2.16b
+       orr     v25.16b, v25.16b, v11.16b
+       ushr    v31.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       add     v20.4s, v20.4s, v8.4s
+       add     v18.4s, v18.4s, v14.4s
+       ushr    v11.4s, v2.4s, #7
+       shl     v2.4s, v2.4s, #25
+       orr     v1.16b, v1.16b, v31.16b
+       add     v20.4s, v20.4s, v25.4s
+       add     v17.4s, v17.4s, v13.4s
+       add     v18.4s, v18.4s, v0.4s
+       orr     v2.16b, v2.16b, v11.16b
+       add     v19.4s, v19.4s, v29.4s
+       eor     v7.16b, v7.16b, v20.16b
+       add     v17.4s, v17.4s, v1.4s
+       eor     v6.16b, v6.16b, v18.16b
+       add     v19.4s, v19.4s, v2.4s
+       rev32   v7.8h, v7.8h
+       eor     v5.16b, v17.16b, v5.16b
+       rev32   v6.8h, v6.8h
+       eor     v26.16b, v19.16b, v26.16b
+       add     v4.4s, v4.4s, v7.4s
+       rev32   v5.8h, v5.8h
+       add     v3.4s, v3.4s, v6.4s
+       rev32   v26.8h, v26.8h
+       eor     v25.16b, v4.16b, v25.16b
+       add     v23.4s, v23.4s, v5.4s
+       eor     v0.16b, v3.16b, v0.16b
+       add     v16.4s, v16.4s, v26.4s
+       ushr    v31.4s, v25.4s, #12
+       shl     v25.4s, v25.4s, #20
+       ushr    v11.4s, v0.4s, #12
+       shl     v0.4s, v0.4s, #20
+       eor     v1.16b, v23.16b, v1.16b
+       orr     v25.16b, v25.16b, v31.16b
+       eor     v2.16b, v16.16b, v2.16b
+       orr     v0.16b, v0.16b, v11.16b
+       ushr    v31.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
        add     v20.4s, v20.4s, v28.4s
+       add     v18.4s, v18.4s, v12.4s
+       ushr    v11.4s, v2.4s, #12
+       shl     v2.4s, v2.4s, #20
+       orr     v1.16b, v1.16b, v31.16b
+       add     v20.4s, v20.4s, v25.4s
        add     v17.4s, v17.4s, v30.4s
-       add     v17.4s, v17.4s, v8.4s
-       eor     v9.16b, v17.16b, v11.16b
-       ushr    v28.4s, v13.4s, #8
-       shl     v11.4s, v13.4s, #24
-       orr     v28.16b, v11.16b, v28.16b
-       ushr    v11.4s, v16.4s, #8
-       shl     v16.4s, v16.4s, #24
-       orr     v16.16b, v16.16b, v11.16b
-       ushr    v11.4s, v31.4s, #8
-       shl     v31.4s, v31.4s, #24
-       add     v6.4s, v28.4s, v6.4s
-       orr     v31.16b, v31.16b, v11.16b
-       ushr    v11.4s, v9.4s, #8
-       shl     v9.4s, v9.4s, #24
-       add     v2.4s, v16.4s, v2.4s
-       eor     v4.16b, v6.16b, v4.16b
-       orr     v9.16b, v9.16b, v11.16b
-       add     v1.4s, v31.4s, v1.4s
-       eor     v3.16b, v2.16b, v3.16b
-       ushr    v11.4s, v4.4s, #7
-       shl     v4.4s, v4.4s, #25
-       add     v5.4s, v9.4s, v5.4s
-       eor     v10.16b, v1.16b, v10.16b
-       orr     v4.16b, v4.16b, v11.16b
-       ushr    v11.4s, v3.4s, #7
-       shl     v3.4s, v3.4s, #25
-       eor     v8.16b, v5.16b, v8.16b
-       orr     v3.16b, v3.16b, v11.16b
-       ushr    v11.4s, v10.4s, #7
-       shl     v10.4s, v10.4s, #25
-       orr     v10.16b, v10.16b, v11.16b
-       ushr    v11.4s, v8.4s, #7
-       shl     v8.4s, v8.4s, #25
-       orr     v8.16b, v8.16b, v11.16b
-       add     v29.4s, v29.4s, v8.4s
-       eor     v16.16b, v29.16b, v16.16b
-       add     v0.4s, v0.4s, v4.4s
-       mov     v12.16b, v26.16b
-       add     v17.4s, v17.4s, v19.4s
-       add     v26.4s, v29.4s, v23.4s
-       eor     v29.16b, v0.16b, v31.16b
-       add     v20.4s, v20.4s, v3.4s
-       rev32   v16.8h, v16.8h
-       stur    q18, [x29, #-176]
-       mov     v18.16b, v27.16b
-       add     v0.4s, v0.4s, v24.4s
-       eor     v27.16b, v20.16b, v9.16b
-       add     v17.4s, v17.4s, v10.4s
-       rev32   v24.8h, v29.8h
-       add     v1.4s, v1.4s, v16.4s
+       add     v18.4s, v18.4s, v0.4s
+       orr     v2.16b, v2.16b, v11.16b
+       add     v19.4s, v19.4s, v21.4s
+       eor     v7.16b, v20.16b, v7.16b
+       add     v17.4s, v17.4s, v1.4s
+       eor     v6.16b, v18.16b, v6.16b
+       add     v19.4s, v19.4s, v2.4s
+       ushr    v31.4s, v7.4s, #8
+       shl     v7.4s, v7.4s, #24
+       ushr    v11.4s, v6.4s, #8
+       shl     v6.4s, v6.4s, #24
+       eor     v5.16b, v17.16b, v5.16b
+       orr     v7.16b, v7.16b, v31.16b
+       eor     v26.16b, v19.16b, v26.16b
+       orr     v6.16b, v6.16b, v11.16b
+       ushr    v31.4s, v5.4s, #8
+       shl     v5.4s, v5.4s, #24
+       ushr    v11.4s, v26.4s, #8
+       shl     v26.4s, v26.4s, #24
+       add     v4.4s, v7.4s, v4.4s
+       orr     v5.16b, v5.16b, v31.16b
+       add     v3.4s, v6.4s, v3.4s
+       orr     v26.16b, v26.16b, v11.16b
+       eor     v25.16b, v4.16b, v25.16b
+       add     v23.4s, v5.4s, v23.4s
+       eor     v0.16b, v3.16b, v0.16b
+       add     v16.4s, v26.4s, v16.4s
+       ushr    v31.4s, v25.4s, #7
+       shl     v25.4s, v25.4s, #25
+       ushr    v11.4s, v0.4s, #7
+       shl     v0.4s, v0.4s, #25
+       eor     v1.16b, v23.16b, v1.16b
+       orr     v25.16b, v25.16b, v31.16b
+       eor     v2.16b, v16.16b, v2.16b
+       orr     v0.16b, v0.16b, v11.16b
+       ushr    v31.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       add     v20.4s, v20.4s, v15.4s
+       ushr    v11.4s, v2.4s, #7
+       shl     v2.4s, v2.4s, #25
+       orr     v1.16b, v1.16b, v31.16b
+       add     v18.4s, v18.4s, v24.4s
+       add     v20.4s, v20.4s, v0.4s
+       add     v19.4s, v19.4s, v9.4s
+       mov     v8.16b, v13.16b
+       ldur    q13, [x29, #-208]
+       orr     v2.16b, v2.16b, v11.16b
+       add     v18.4s, v18.4s, v1.4s
+       add     v17.4s, v17.4s, v13.4s
+       eor     v26.16b, v20.16b, v26.16b
+       add     v19.4s, v19.4s, v25.4s
+       eor     v7.16b, v18.16b, v7.16b
+       add     v17.4s, v17.4s, v2.4s
+       rev32   v26.8h, v26.8h
+       eor     v5.16b, v19.16b, v5.16b
+       rev32   v7.8h, v7.8h
+       eor     v6.16b, v17.16b, v6.16b
+       add     v23.4s, v23.4s, v26.4s
+       rev32   v5.8h, v5.8h
+       add     v16.4s, v16.4s, v7.4s
+       rev32   v6.8h, v6.8h
+       eor     v0.16b, v23.16b, v0.16b
+       add     v3.4s, v3.4s, v5.4s
+       eor     v1.16b, v16.16b, v1.16b
+       add     v4.4s, v4.4s, v6.4s
+       ushr    v31.4s, v0.4s, #12
+       shl     v0.4s, v0.4s, #20
+       eor     v25.16b, v3.16b, v25.16b
+       ushr    v11.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
+       orr     v0.16b, v0.16b, v31.16b
+       eor     v2.16b, v4.16b, v2.16b
+       ushr    v31.4s, v25.4s, #12
+       shl     v25.4s, v25.4s, #20
+       orr     v1.16b, v1.16b, v11.16b
+       ushr    v11.4s, v2.4s, #12
+       shl     v2.4s, v2.4s, #20
+       add     v20.4s, v20.4s, v22.4s
+       orr     v25.16b, v25.16b, v31.16b
+       add     v19.4s, v19.4s, v10.4s
+       mov     v27.16b, v12.16b
+       mov     v12.16b, v30.16b
+       mov     v29.16b, v21.16b
+       mov     v21.16b, v24.16b
+       ldr     q24, [sp, #192]
+       mov     v30.16b, v22.16b
+       ldr     q22, [sp, #256]
+       orr     v2.16b, v2.16b, v11.16b
+       add     v20.4s, v20.4s, v0.4s
+       add     v18.4s, v18.4s, v24.4s
+       add     v19.4s, v19.4s, v25.4s
+       add     v17.4s, v17.4s, v22.4s
+       eor     v26.16b, v20.16b, v26.16b
+       add     v18.4s, v18.4s, v1.4s
+       eor     v5.16b, v19.16b, v5.16b
+       add     v17.4s, v17.4s, v2.4s
+       ushr    v31.4s, v26.4s, #8
+       shl     v26.4s, v26.4s, #24
+       ushr    v11.4s, v5.4s, #8
+       shl     v5.4s, v5.4s, #24
+       eor     v7.16b, v18.16b, v7.16b
+       eor     v6.16b, v17.16b, v6.16b
+       orr     v26.16b, v26.16b, v31.16b
+       orr     v5.16b, v5.16b, v11.16b
+       ushr    v31.4s, v7.4s, #8
+       shl     v7.4s, v7.4s, #24
+       ushr    v11.4s, v6.4s, #8
+       shl     v6.4s, v6.4s, #24
+       add     v23.4s, v26.4s, v23.4s
+       orr     v7.16b, v7.16b, v31.16b
+       add     v3.4s, v5.4s, v3.4s
+       orr     v6.16b, v6.16b, v11.16b
+       eor     v0.16b, v23.16b, v0.16b
+       add     v16.4s, v7.4s, v16.4s
+       eor     v25.16b, v3.16b, v25.16b
+       add     v4.4s, v6.4s, v4.4s
+       ushr    v31.4s, v0.4s, #7
+       shl     v0.4s, v0.4s, #25
+       ushr    v11.4s, v25.4s, #7
+       shl     v25.4s, v25.4s, #25
+       eor     v1.16b, v16.16b, v1.16b
+       eor     v2.16b, v4.16b, v2.16b
+       orr     v0.16b, v0.16b, v31.16b
+       orr     v25.16b, v25.16b, v11.16b
+       ushr    v31.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       ushr    v11.4s, v2.4s, #7
+       shl     v2.4s, v2.4s, #25
+       add     v20.4s, v20.4s, v14.4s
+       add     v18.4s, v18.4s, v27.4s
+       ldr     q27, [sp, #224]
+       orr     v1.16b, v1.16b, v31.16b
+       orr     v2.16b, v2.16b, v11.16b
        add     v20.4s, v20.4s, v25.4s
-       eor     v25.16b, v17.16b, v28.16b
-       rev32   v27.8h, v27.8h
-       add     v5.4s, v5.4s, v24.4s
-       eor     v28.16b, v1.16b, v8.16b
-       rev32   v25.8h, v25.8h
-       add     v6.4s, v6.4s, v27.4s
-       eor     v4.16b, v5.16b, v4.16b
-       ushr    v31.4s, v28.4s, #12
-       shl     v28.4s, v28.4s, #20
-       add     v2.4s, v2.4s, v25.4s
-       eor     v3.16b, v6.16b, v3.16b
-       orr     v28.16b, v28.16b, v31.16b
-       ushr    v31.4s, v4.4s, #12
-       shl     v4.4s, v4.4s, #20
-       eor     v29.16b, v2.16b, v10.16b
-       orr     v4.16b, v4.16b, v31.16b
-       ushr    v31.4s, v3.4s, #12
-       shl     v3.4s, v3.4s, #20
-       add     v26.4s, v26.4s, v28.4s
-       orr     v3.16b, v3.16b, v31.16b
-       ushr    v31.4s, v29.4s, #12
-       shl     v29.4s, v29.4s, #20
-       eor     v16.16b, v26.16b, v16.16b
-       add     v0.4s, v0.4s, v4.4s
-       add     v17.4s, v17.4s, v12.4s
-       orr     v29.16b, v29.16b, v31.16b
-       eor     v24.16b, v0.16b, v24.16b
-       add     v0.4s, v0.4s, v22.4s
-       add     v20.4s, v20.4s, v3.4s
-       ushr    v22.4s, v16.4s, #8
-       shl     v16.4s, v16.4s, #24
-       add     v23.4s, v26.4s, v21.4s
-       eor     v21.16b, v20.16b, v27.16b
        add     v17.4s, v17.4s, v29.4s
-       orr     v16.16b, v16.16b, v22.16b
-       ushr    v22.4s, v24.4s, #8
-       shl     v24.4s, v24.4s, #24
-       eor     v25.16b, v17.16b, v25.16b
-       orr     v22.16b, v24.16b, v22.16b
+       add     v18.4s, v18.4s, v0.4s
+       add     v19.4s, v19.4s, v8.4s
+       eor     v7.16b, v7.16b, v20.16b
+       add     v17.4s, v17.4s, v1.4s
+       eor     v6.16b, v6.16b, v18.16b
+       add     v19.4s, v19.4s, v2.4s
+       rev32   v7.8h, v7.8h
+       eor     v5.16b, v17.16b, v5.16b
+       rev32   v6.8h, v6.8h
+       eor     v26.16b, v19.16b, v26.16b
+       add     v4.4s, v4.4s, v7.4s
+       rev32   v5.8h, v5.8h
+       add     v3.4s, v3.4s, v6.4s
+       rev32   v26.8h, v26.8h
+       eor     v25.16b, v4.16b, v25.16b
+       add     v23.4s, v23.4s, v5.4s
+       eor     v0.16b, v3.16b, v0.16b
+       add     v16.4s, v16.4s, v26.4s
+       ushr    v29.4s, v25.4s, #12
+       shl     v25.4s, v25.4s, #20
+       ushr    v31.4s, v0.4s, #12
+       shl     v0.4s, v0.4s, #20
+       eor     v1.16b, v23.16b, v1.16b
+       eor     v2.16b, v16.16b, v2.16b
+       orr     v25.16b, v25.16b, v29.16b
+       orr     v0.16b, v0.16b, v31.16b
+       ushr    v29.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
+       ushr    v31.4s, v2.4s, #12
+       shl     v2.4s, v2.4s, #20
+       add     v18.4s, v18.4s, v21.4s
+       ldr     q21, [sp, #240]
+       add     v20.4s, v20.4s, v27.4s
+       prfm    pldl1keep, [x17, #256]
+       orr     v1.16b, v1.16b, v29.16b
+       prfm    pldl1keep, [x21, #256]
+       orr     v2.16b, v2.16b, v31.16b
+       prfm    pldl1keep, [x16, #256]
+       add     v18.4s, v18.4s, v0.4s
+       prfm    pldl1keep, [x6, #256]
+       add     v17.4s, v17.4s, v21.4s
+       add     v19.4s, v19.4s, v22.4s
+       add     v20.4s, v20.4s, v25.4s
+       eor     v6.16b, v18.16b, v6.16b
+       add     v17.4s, v17.4s, v1.4s
+       add     v19.4s, v19.4s, v2.4s
+       eor     v7.16b, v20.16b, v7.16b
+       ushr    v22.4s, v6.4s, #8
+       shl     v6.4s, v6.4s, #24
+       eor     v5.16b, v17.16b, v5.16b
+       eor     v26.16b, v19.16b, v26.16b
+       ushr    v21.4s, v7.4s, #8
+       shl     v7.4s, v7.4s, #24
+       orr     v6.16b, v6.16b, v22.16b
+       ushr    v22.4s, v5.4s, #8
+       shl     v5.4s, v5.4s, #24
+       ushr    v29.4s, v26.4s, #8
+       shl     v26.4s, v26.4s, #24
+       orr     v7.16b, v7.16b, v21.16b
+       orr     v5.16b, v5.16b, v22.16b
+       add     v3.4s, v6.4s, v3.4s
+       orr     v21.16b, v26.16b, v29.16b
+       add     v4.4s, v7.4s, v4.4s
+       add     v22.4s, v5.4s, v23.4s
+       eor     v0.16b, v3.16b, v0.16b
+       add     v16.4s, v21.4s, v16.4s
+       eor     v23.16b, v4.16b, v25.16b
+       eor     v1.16b, v22.16b, v1.16b
+       ushr    v25.4s, v0.4s, #7
+       shl     v0.4s, v0.4s, #25
+       eor     v2.16b, v16.16b, v2.16b
+       ushr    v26.4s, v23.4s, #7
+       shl     v23.4s, v23.4s, #25
+       orr     v0.16b, v0.16b, v25.16b
+       ushr    v25.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       ushr    v29.4s, v2.4s, #7
+       shl     v2.4s, v2.4s, #25
+       add     v20.4s, v20.4s, v28.4s
+       orr     v23.16b, v23.16b, v26.16b
+       orr     v1.16b, v1.16b, v25.16b
+       orr     v2.16b, v2.16b, v29.16b
+       add     v20.4s, v20.4s, v0.4s
+       add     v18.4s, v18.4s, v13.4s
+       add     v17.4s, v17.4s, v30.4s
+       add     v19.4s, v19.4s, v10.4s
+       eor     v21.16b, v20.16b, v21.16b
+       add     v18.4s, v18.4s, v1.4s
+       add     v17.4s, v17.4s, v2.4s
+       add     v19.4s, v19.4s, v23.4s
+       rev32   v21.8h, v21.8h
+       eor     v7.16b, v18.16b, v7.16b
+       eor     v6.16b, v17.16b, v6.16b
+       eor     v5.16b, v19.16b, v5.16b
+       add     v22.4s, v22.4s, v21.4s
+       rev32   v7.8h, v7.8h
+       rev32   v6.8h, v6.8h
+       rev32   v5.8h, v5.8h
+       eor     v0.16b, v22.16b, v0.16b
+       add     v16.4s, v16.4s, v7.4s
+       add     v4.4s, v4.4s, v6.4s
+       add     v3.4s, v3.4s, v5.4s
+       ushr    v25.4s, v0.4s, #12
+       shl     v0.4s, v0.4s, #20
+       eor     v1.16b, v16.16b, v1.16b
+       eor     v2.16b, v4.16b, v2.16b
+       eor     v23.16b, v3.16b, v23.16b
+       orr     v0.16b, v0.16b, v25.16b
+       ushr    v25.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
+       ushr    v26.4s, v2.4s, #12
+       shl     v2.4s, v2.4s, #20
+       ushr    v27.4s, v23.4s, #12
+       shl     v23.4s, v23.4s, #20
+       orr     v1.16b, v1.16b, v25.16b
+       add     v20.4s, v20.4s, v24.4s
+       orr     v2.16b, v2.16b, v26.16b
+       orr     v23.16b, v23.16b, v27.16b
+       add     v18.4s, v18.4s, v12.4s
+       add     v17.4s, v17.4s, v9.4s
+       add     v19.4s, v19.4s, v15.4s
+       add     v20.4s, v20.4s, v0.4s
+       add     v18.4s, v18.4s, v1.4s
+       add     v17.4s, v17.4s, v2.4s
+       add     v19.4s, v19.4s, v23.4s
+       eor     v21.16b, v20.16b, v21.16b
+       eor     v7.16b, v18.16b, v7.16b
+       eor     v6.16b, v17.16b, v6.16b
+       eor     v5.16b, v19.16b, v5.16b
        ushr    v24.4s, v21.4s, #8
        shl     v21.4s, v21.4s, #24
+       ushr    v25.4s, v7.4s, #8
+       shl     v7.4s, v7.4s, #24
+       ushr    v26.4s, v6.4s, #8
+       shl     v6.4s, v6.4s, #24
+       ushr    v27.4s, v5.4s, #8
+       shl     v5.4s, v5.4s, #24
        orr     v21.16b, v21.16b, v24.16b
-       ushr    v24.4s, v25.4s, #8
-       shl     v25.4s, v25.4s, #24
-       add     v1.4s, v16.4s, v1.4s
-       orr     v24.16b, v25.16b, v24.16b
-       add     v5.4s, v22.4s, v5.4s
-       eor     v25.16b, v1.16b, v28.16b
-       add     v6.4s, v21.4s, v6.4s
-       eor     v4.16b, v5.16b, v4.16b
-       ushr    v27.4s, v25.4s, #7
-       shl     v25.4s, v25.4s, #25
-       add     v2.4s, v24.4s, v2.4s
-       eor     v3.16b, v6.16b, v3.16b
-       orr     v25.16b, v25.16b, v27.16b
-       ushr    v27.4s, v4.4s, #7
-       shl     v4.4s, v4.4s, #25
-       ldur    q19, [x29, #-176]
-       eor     v26.16b, v2.16b, v29.16b
-       orr     v4.16b, v4.16b, v27.16b
-       ushr    v27.4s, v3.4s, #7
-       shl     v3.4s, v3.4s, #25
-       orr     v3.16b, v3.16b, v27.16b
-       ushr    v27.4s, v26.4s, #7
-       shl     v26.4s, v26.4s, #25
-       add     v20.4s, v20.4s, v18.4s
-       add     v17.4s, v17.4s, v30.4s
-       orr     v26.16b, v26.16b, v27.16b
-       add     v0.4s, v0.4s, v3.4s
-       eor     v16.16b, v0.16b, v16.16b
-       add     v0.4s, v0.4s, v19.4s
-       add     v19.4s, v20.4s, v26.4s
-       add     v17.4s, v17.4s, v25.4s
-       eor     v20.16b, v19.16b, v22.16b
-       add     v7.4s, v19.4s, v7.4s
-       eor     v19.16b, v17.16b, v21.16b
-       ldur    q21, [x29, #-64]
-       add     v23.4s, v23.4s, v4.4s
-       eor     v24.16b, v23.16b, v24.16b
-       rev32   v16.8h, v16.8h
-       add     v17.4s, v17.4s, v21.4s
-       rev32   v21.8h, v24.8h
-       add     v6.4s, v6.4s, v21.4s
-       rev32   v20.8h, v20.8h
-       add     v2.4s, v2.4s, v16.4s
-       eor     v4.16b, v6.16b, v4.16b
-       rev32   v19.8h, v19.8h
-       add     v1.4s, v1.4s, v20.4s
-       eor     v3.16b, v2.16b, v3.16b
-       ushr    v24.4s, v4.4s, #12
-       shl     v4.4s, v4.4s, #20
-       add     v5.4s, v5.4s, v19.4s
-       eor     v22.16b, v1.16b, v26.16b
-       orr     v4.16b, v4.16b, v24.16b
-       ushr    v24.4s, v3.4s, #12
-       shl     v3.4s, v3.4s, #20
-       add     v18.4s, v23.4s, v14.4s
-       eor     v23.16b, v5.16b, v25.16b
-       orr     v3.16b, v3.16b, v24.16b
-       ushr    v24.4s, v22.4s, #12
-       shl     v22.4s, v22.4s, #20
-       orr     v22.16b, v22.16b, v24.16b
-       ushr    v24.4s, v23.4s, #12
-       shl     v23.4s, v23.4s, #20
-       orr     v23.16b, v23.16b, v24.16b
-       add     v18.4s, v18.4s, v4.4s
-       add     v0.4s, v0.4s, v3.4s
-       add     v24.4s, v17.4s, v23.4s
-       eor     v17.16b, v18.16b, v21.16b
-       add     v7.4s, v7.4s, v22.4s
-       eor     v16.16b, v0.16b, v16.16b
-       ushr    v21.4s, v17.4s, #8
-       shl     v17.4s, v17.4s, #24
-       eor     v20.16b, v7.16b, v20.16b
-       orr     v21.16b, v17.16b, v21.16b
-       ushr    v17.4s, v16.4s, #8
-       shl     v16.4s, v16.4s, #24
-       eor     v19.16b, v24.16b, v19.16b
-       orr     v16.16b, v16.16b, v17.16b
-       ushr    v17.4s, v20.4s, #8
-       shl     v20.4s, v20.4s, #24
-       orr     v25.16b, v20.16b, v17.16b
-       ushr    v17.4s, v19.4s, #8
-       shl     v19.4s, v19.4s, #24
-       orr     v19.16b, v19.16b, v17.16b
-       add     v1.4s, v25.4s, v1.4s
-       eor     v22.16b, v1.16b, v22.16b
-       eor     v20.16b, v1.16b, v18.16b
-       add     v1.4s, v19.4s, v5.4s
-       eor     v26.16b, v1.16b, v0.16b
-       add     v0.4s, v21.4s, v6.4s
-       eor     v5.16b, v1.16b, v23.16b
-       eor     v1.16b, v0.16b, v4.16b
-       eor     v17.16b, v0.16b, v7.16b
-       add     v0.4s, v16.4s, v2.4s
-       eor     v2.16b, v0.16b, v3.16b
-       eor     v6.16b, v0.16b, v24.16b
-       ushr    v0.4s, v1.4s, #7
+       orr     v7.16b, v7.16b, v25.16b
+       orr     v6.16b, v6.16b, v26.16b
+       orr     v5.16b, v5.16b, v27.16b
+       add     v22.4s, v21.4s, v22.4s
+       add     v16.4s, v7.4s, v16.4s
+       add     v4.4s, v6.4s, v4.4s
+       add     v3.4s, v5.4s, v3.4s
+       eor     v0.16b, v22.16b, v0.16b
+       eor     v1.16b, v16.16b, v1.16b
+       eor     v2.16b, v4.16b, v2.16b
+       eor     v23.16b, v3.16b, v23.16b
+       ushr    v24.4s, v0.4s, #7
+       shl     v0.4s, v0.4s, #25
+       ushr    v25.4s, v1.4s, #7
        shl     v1.4s, v1.4s, #25
-       orr     v0.16b, v1.16b, v0.16b
-       ushr    v1.4s, v2.4s, #7
+       ushr    v26.4s, v2.4s, #7
        shl     v2.4s, v2.4s, #25
-       orr     v1.16b, v2.16b, v1.16b
-       ushr    v2.4s, v22.4s, #7
-       shl     v3.4s, v22.4s, #25
-       orr     v2.16b, v3.16b, v2.16b
-       ushr    v3.4s, v5.4s, #7
-       shl     v4.4s, v5.4s, #25
-       orr     v3.16b, v4.16b, v3.16b
-       eor     v8.16b, v16.16b, v3.16b
-       eor     v9.16b, v25.16b, v0.16b
-       eor     v31.16b, v1.16b, v19.16b
-       cmp     x17, x22
-       eor     v15.16b, v2.16b, v21.16b
-       mov     w18, w19
-       b.ne    .LBB2_4
-.LBB2_7:
-       zip1    v0.4s, v20.4s, v26.4s
-       zip2    v1.4s, v20.4s, v26.4s
-       zip1    v2.4s, v17.4s, v6.4s
-       zip2    v3.4s, v17.4s, v6.4s
-       zip1    v4.4s, v8.4s, v9.4s
-       zip2    v5.4s, v8.4s, v9.4s
-       zip1    v6.4s, v31.4s, v15.4s
-       zip2    v7.4s, v31.4s, v15.4s
-       add     x13, x20, #4
-       tst     w5, #0x1
-       sub     x28, x28, #4
-       zip1    v16.2d, v0.2d, v2.2d
-       zip2    v0.2d, v0.2d, v2.2d
-       zip1    v2.2d, v1.2d, v3.2d
-       zip2    v1.2d, v1.2d, v3.2d
-       zip1    v3.2d, v4.2d, v6.2d
-       zip2    v4.2d, v4.2d, v6.2d
-       zip1    v6.2d, v5.2d, v7.2d
-       zip2    v5.2d, v5.2d, v7.2d
-       add     x24, x24, #32
-       csel    x20, x13, x20, ne
-       cmp     x28, #3
-       stp     q16, q3, [x26]
-       stp     q0, q4, [x26, #32]
-       stp     q2, q6, [x26, #64]
-       stp     q1, q5, [x26, #96]
-       add     x26, x26, #128
-       b.hi    .LBB2_2
-.LBB2_8:
-       cbz     x28, .LBB2_16
+       ushr    v27.4s, v23.4s, #7
+       shl     v23.4s, v23.4s, #25
+       orr     v0.16b, v0.16b, v24.16b
+       orr     v1.16b, v1.16b, v25.16b
+       orr     v2.16b, v2.16b, v26.16b
+       orr     v23.16b, v23.16b, v27.16b
+       movi    v24.4s, #64
+       eor     v12.16b, v4.16b, v20.16b
+       eor     v31.16b, v18.16b, v3.16b
+       eor     v29.16b, v17.16b, v22.16b
+       eor     v30.16b, v16.16b, v19.16b
+       eor     v28.16b, v7.16b, v23.16b
+       eor     v23.16b, v6.16b, v0.16b
+       eor     v13.16b, v1.16b, v5.16b
+       eor     v25.16b, v2.16b, v21.16b
+       cbnz    x15, .LBB3_5
+       b       .LBB3_2
+.LBB3_6:
+       cbz     x24, .LBB3_14
        orr     w8, w7, w19
-       and     x21, x5, #0x1
-       stur    w8, [x29, #-64]
-.LBB2_10:
+       and     x22, x5, #0x1
+       stur    w8, [x29, #-192]
+.LBB3_8:
        ldr     x8, [sp, #40]
-       ldr     x25, [x24]
-       ldur    w4, [x29, #-64]
-       ldp     q1, q0, [x8]
-       mov     x8, x22
-       stp     q1, q0, [x29, #-48]
-.LBB2_11:
-       subs    x23, x8, #1
-       b.eq    .LBB2_13
-       cbnz    x8, .LBB2_14
-       b       .LBB2_15
-.LBB2_13:
-       orr     w4, w4, w27
-.LBB2_14:
-       sub     x0, x29, #48
-       mov     w2, #64
-       mov     x1, x25
-       mov     x3, x20
-       bl      zfs_blake3_compress_in_place_sse2
+       mov     x28, x0
+       ldr     x25, [x0]
+       mov     x23, x2
+       ldur    w5, [x29, #-192]
+       ldp     q0, q1, [x8]
+       mov     x8, x2
+       b       .LBB3_11
+.LBB3_9:
+       orr     w5, w5, w27
+.LBB3_10:
+       sub     x0, x29, #144
+       sub     x1, x29, #176
+       mov     x2, x25
+       mov     w3, #64
+       mov     x4, x20
+       bl      compress_pre
+       ldp     q0, q1, [x29, #-144]
        add     x25, x25, #64
-       mov     x8, x23
-       mov     w4, w19
-       b       .LBB2_11
-.LBB2_15:
-       ldp     q0, q1, [x29, #-48]
-       add     x20, x20, x21
-       add     x24, x24, #8
-       subs    x28, x28, #1
-       stp     q0, q1, [x26], #32
-       b.ne    .LBB2_10
-.LBB2_16:
-       add     sp, sp, #384
+       mov     x8, x21
+       mov     w5, w19
+       ldp     q2, q3, [x29, #-112]
+       eor     v0.16b, v2.16b, v0.16b
+       eor     v1.16b, v3.16b, v1.16b
+.LBB3_11:
+       subs    x21, x8, #1
+       stp     q0, q1, [x29, #-176]
+       b.eq    .LBB3_9
+       cbnz    x8, .LBB3_10
+       ldp     q1, q0, [x29, #-176]
+       mov     x0, x28
+       add     x20, x20, x22
+       add     x0, x28, #8
+       subs    x24, x24, #1
+       mov     x2, x23
+       stp     q1, q0, [x26], #32
+       b.ne    .LBB3_8
+.LBB3_14:
+       add     sp, sp, #464
        ldp     x20, x19, [sp, #144]
        ldp     x22, x21, [sp, #128]
        ldp     x24, x23, [sp, #112]
@@ -2442,9 +2052,10 @@ zfs_blake3_hash_many_sse2:
        ldp     d11, d10, [sp, #32]
        ldp     d13, d12, [sp, #16]
        ldp     d15, d14, [sp], #160
+       hint    #29
        ret
-.Lfunc_end2:
-       .size   zfs_blake3_hash_many_sse2, .Lfunc_end2-zfs_blake3_hash_many_sse2
+.Lfunc_end3:
+       .size   zfs_blake3_hash_many_sse2, .Lfunc_end3-zfs_blake3_hash_many_sse2
        .cfi_endproc
        .section        ".note.GNU-stack","",@progbits
-#endif
+#endif
\ No newline at end of file
index a05baec96ce56dcd1f041a69af7d5324ca9121f1..c4c2dfc5bcde0fc924b067742f46c10ba7af5bd5 100644 (file)
 /*
  * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
  * Copyright (c) 2019-2022 Samuel Neves
- * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ * Copyright (c) 2022-2023 Tino Reichardt <milky-zfs@mcmilk.de>
  *
  * This is converted assembly: SSE4.1 -> ARMv8-A
  * Used tools: SIMDe https://github.com/simd-everywhere/simde
+ *
+ * Should work on FreeBSD, Linux and macOS
+ * see: https://github.com/mcmilk/BLAKE3-tests/blob/master/contrib/simde.sh
  */
 
 #if defined(__aarch64__)
        .text
+       .section        .note.gnu.property,"a",@note
+       .p2align        3
+       .word   4
+       .word   16
+       .word   5
+       .asciz  "GNU"
+       .word   3221225472
+       .word   4
+       .word   3
+       .word   0
+.Lsec_end0:
+       .text
+       .globl  zfs_blake3_compress_in_place_sse41      /* exported entry: run compress_pre on one block and fold the result back into the 32-byte state at x0 */
+       .p2align        2
+       .type   zfs_blake3_compress_in_place_sse41,@function
+zfs_blake3_compress_in_place_sse41:    /* in: x0 = state pointer (32 bytes read by compress_pre, then overwritten here); x1, w2, x3, w4 are forwarded unchanged */
+       .cfi_startproc                  /* NOTE(review): x1/w2/x3/w4 are presumably block, block_len, counter, flags as in upstream BLAKE3 -- confirm against the C prototype */
+       hint    #25                     /* hint-space encoding of PACIASP: sign the return address (executes as NOP without pointer auth) */
+       .cfi_negate_ra_state            /* unwind info: return address is signed from here on */
+       sub     sp, sp, #96             /* frame: [sp,#0..63] compress_pre output, [sp,#64] x29/x30, [sp,#80] x19 */
+       stp     x29, x30, [sp, #64]
+       add     x29, sp, #64            /* frame pointer addresses the saved x29/x30 pair */
+       str     x19, [sp, #80]          /* x19 is callee-saved and is used below to hold the state pointer */
+       .cfi_def_cfa w29, 32
+       .cfi_offset w19, -16
+       .cfi_offset w30, -24
+       .cfi_offset w29, -32
+       mov     x19, x0                 /* keep the state pointer alive across the call */
+       mov     w5, w4                  /* shift every argument up one slot to free x0 for the output buffer; */
+       mov     x4, x3                  /* done highest-first so no source register is overwritten before it is read */
+       mov     w3, w2
+       mov     x2, x1
+       mov     x0, sp                  /* compress_pre arg0 = 64-byte scratch buffer at the bottom of the frame */
+       mov     x1, x19                 /* compress_pre arg1 = state pointer */
+       bl      compress_pre
+       ldp     q0, q1, [sp]            /* first 32 bytes of the compress_pre output */
+       ldp     q2, q3, [sp, #32]       /* second 32 bytes of the compress_pre output */
+       eor     v0.16b, v2.16b, v0.16b  /* XOR the upper half into the lower half ... */
+       eor     v1.16b, v3.16b, v1.16b
+       ldp     x29, x30, [sp, #64]
+       stp     q0, q1, [x19]           /* ... and store the 32-byte result over the caller's state */
+       ldr     x19, [sp, #80]
+       add     sp, sp, #96
+       hint    #29                     /* hint-space encoding of AUTIASP: authenticate the return address before returning */
+       ret
+.Lfunc_end0:
+       .size   zfs_blake3_compress_in_place_sse41, .Lfunc_end0-zfs_blake3_compress_in_place_sse41
+       .cfi_endproc
+
        .section        .rodata.cst16,"aM",@progbits,16
        .p2align        4
-.LCPI0_0:
+.LCPI1_0:
+       .xword  -4942790177982912921
+       .xword  -6534734903820487822
+.LCPI1_1:
        .byte   2
        .byte   3
        .byte   0
        .byte   15
        .byte   12
        .byte   13
-.LCPI0_1:
-       .word   1779033703
-       .word   3144134277
-       .word   1013904242
-       .word   2773480762
-.LCPI0_2:
+.LCPI1_2:
        .byte   1
        .byte   2
        .byte   3
        .byte   14
        .byte   15
        .byte   12
-.LCPI0_3:
-       .byte   0
-       .byte   1
-       .byte   2
-       .byte   3
-       .byte   20
-       .byte   21
-       .byte   22
-       .byte   23
-       .byte   8
-       .byte   9
-       .byte   10
-       .byte   11
-       .byte   28
-       .byte   29
-       .byte   30
-       .byte   31
-.LCPI0_4:
-       .byte   0
-       .byte   1
-       .byte   2
-       .byte   3
-       .byte   4
-       .byte   5
-       .byte   6
-       .byte   7
-       .byte   8
-       .byte   9
-       .byte   10
-       .byte   11
-       .byte   28
-       .byte   29
-       .byte   30
-       .byte   31
        .text
-       .globl  zfs_blake3_compress_in_place_sse41
        .p2align        2
-       .type   zfs_blake3_compress_in_place_sse41,@function
-zfs_blake3_compress_in_place_sse41:
+       .type   compress_pre,@function
+compress_pre:
        .cfi_startproc
-       ldp     q7, q6, [x0]
-       ldp     q17, q18, [x1]
-       add     x12, x1, #32
-       ld2     { v4.4s, v5.4s }, [x12]
-       lsr     x10, x3, #32
-       fmov    s16, w3
-       adrp    x13, .LCPI0_0
-       adrp    x11, .LCPI0_1
-       and     w8, w2, #0xff
-       mov     v16.s[1], w10
-       ldr     q0, [x13, :lo12:.LCPI0_0]
-       ldr     q20, [x11, :lo12:.LCPI0_1]
-       adrp    x11, .LCPI0_4
-       and     w9, w4, #0xff
-       ldr     q2, [x11, :lo12:.LCPI0_4]
-       mov     v16.s[2], w8
-       uzp1    v21.4s, v17.4s, v18.4s
-       add     v7.4s, v6.4s, v7.4s
-       adrp    x12, .LCPI0_3
-       mov     v16.s[3], w9
-       uzp2    v18.4s, v17.4s, v18.4s
-       add     v7.4s, v7.4s, v21.4s
-       ext     v17.16b, v5.16b, v5.16b, #12
-       ldr     q3, [x12, :lo12:.LCPI0_3]
-       ext     v24.16b, v4.16b, v4.16b, #12
-       eor     v16.16b, v7.16b, v16.16b
-       mov     v27.16b, v17.16b
-       uzp1    v19.4s, v21.4s, v21.4s
-       ext     v25.16b, v21.16b, v21.16b, #12
-       zip2    v28.4s, v18.4s, v17.4s
-       tbl     v29.16b, { v16.16b }, v0.16b
-       mov     v27.s[1], v24.s[2]
-       zip1    v23.2d, v17.2d, v18.2d
-       ext     v19.16b, v19.16b, v21.16b, #8
-       add     v22.4s, v29.4s, v20.4s
-       ext     v26.16b, v21.16b, v25.16b, #12
-       tbl     v20.16b, { v23.16b, v24.16b }, v2.16b
-       zip1    v21.4s, v28.4s, v24.4s
-       zip1    v23.4s, v24.4s, v28.4s
-       uzp2    v19.4s, v19.4s, v18.4s
-       eor     v24.16b, v22.16b, v6.16b
-       ext     v25.16b, v20.16b, v20.16b, #12
-       ext     v6.16b, v23.16b, v21.16b, #8
-       add     v7.4s, v7.4s, v18.4s
-       ext     v18.16b, v19.16b, v19.16b, #4
-       tbl     v16.16b, { v26.16b, v27.16b }, v3.16b
-       uzp1    v21.4s, v20.4s, v25.4s
-       mov     v26.16b, v6.16b
-       ext     v23.16b, v18.16b, v18.16b, #12
-       mov     v26.s[1], v21.s[2]
-       adrp    x10, .LCPI0_2
-       ext     v25.16b, v18.16b, v23.16b, #12
-       uzp1    v23.4s, v18.4s, v18.4s
-       ldr     q1, [x10, :lo12:.LCPI0_2]
-       ext     v18.16b, v23.16b, v18.16b, #8
-       ushr    v23.4s, v24.4s, #12
-       shl     v24.4s, v24.4s, #20
-       orr     v23.16b, v24.16b, v23.16b
-       add     v7.4s, v7.4s, v23.4s
-       eor     v27.16b, v29.16b, v7.16b
-       add     v4.4s, v7.4s, v4.4s
-       tbl     v7.16b, { v25.16b, v26.16b }, v3.16b
-       tbl     v26.16b, { v27.16b }, v1.16b
-       add     v22.4s, v22.4s, v26.4s
-       uzp2    v18.4s, v18.4s, v16.4s
-       eor     v23.16b, v23.16b, v22.16b
-       ext     v5.16b, v18.16b, v18.16b, #4
-       ushr    v27.4s, v23.4s, #7
-       shl     v23.4s, v23.4s, #25
-       uzp1    v25.4s, v5.4s, v5.4s
-       orr     v23.16b, v23.16b, v27.16b
-       ext     v28.16b, v4.16b, v4.16b, #12
-       ext     v4.16b, v25.16b, v5.16b, #8
-       ext     v25.16b, v26.16b, v26.16b, #8
-       add     v26.4s, v28.4s, v23.4s
-       eor     v25.16b, v26.16b, v25.16b
-       ext     v22.16b, v22.16b, v22.16b, #4
-       tbl     v25.16b, { v25.16b }, v0.16b
-       add     v22.4s, v22.4s, v25.4s
-       eor     v23.16b, v23.16b, v22.16b
-       add     v17.4s, v26.4s, v17.4s
-       ushr    v26.4s, v23.4s, #12
-       shl     v23.4s, v23.4s, #20
-       orr     v23.16b, v23.16b, v26.16b
-       add     v17.4s, v17.4s, v23.4s
-       eor     v25.16b, v25.16b, v17.16b
-       add     v17.4s, v17.4s, v19.4s
-       tbl     v19.16b, { v25.16b }, v1.16b
-       add     v22.4s, v22.4s, v19.4s
-       eor     v23.16b, v23.16b, v22.16b
-       ushr    v25.4s, v23.4s, #7
-       shl     v23.4s, v23.4s, #25
-       ext     v17.16b, v17.16b, v17.16b, #4
-       orr     v23.16b, v23.16b, v25.16b
-       ext     v19.16b, v19.16b, v19.16b, #8
-       add     v17.4s, v17.4s, v23.4s
-       eor     v19.16b, v17.16b, v19.16b
-       ext     v22.16b, v22.16b, v22.16b, #12
-       tbl     v19.16b, { v19.16b }, v0.16b
-       add     v22.4s, v22.4s, v19.4s
-       eor     v23.16b, v23.16b, v22.16b
-       ushr    v25.4s, v23.4s, #12
-       shl     v23.4s, v23.4s, #20
-       add     v17.4s, v17.4s, v16.4s
-       orr     v23.16b, v23.16b, v25.16b
-       add     v17.4s, v17.4s, v23.4s
-       ext     v25.16b, v17.16b, v17.16b, #12
-       eor     v17.16b, v19.16b, v17.16b
-       tbl     v17.16b, { v17.16b }, v1.16b
-       add     v19.4s, v22.4s, v17.4s
-       eor     v22.16b, v23.16b, v19.16b
-       add     v25.4s, v25.4s, v21.4s
-       zip1    v20.2d, v6.2d, v16.2d
-       ushr    v23.4s, v22.4s, #7
-       shl     v22.4s, v22.4s, #25
-       zip2    v24.4s, v16.4s, v6.4s
-       tbl     v26.16b, { v20.16b, v21.16b }, v2.16b
-       orr     v22.16b, v22.16b, v23.16b
-       zip1    v16.4s, v24.4s, v21.4s
-       zip1    v20.4s, v21.4s, v24.4s
-       ext     v21.16b, v26.16b, v26.16b, #12
-       ext     v17.16b, v17.16b, v17.16b, #8
-       add     v25.4s, v25.4s, v22.4s
-       ext     v16.16b, v20.16b, v16.16b, #8
-       uzp1    v21.4s, v26.4s, v21.4s
-       eor     v26.16b, v25.16b, v17.16b
-       ext     v19.16b, v19.16b, v19.16b, #4
-       tbl     v26.16b, { v26.16b }, v0.16b
-       mov     v29.16b, v16.16b
-       add     v19.4s, v19.4s, v26.4s
-       ext     v27.16b, v5.16b, v5.16b, #12
-       mov     v29.s[1], v21.s[2]
-       eor     v22.16b, v22.16b, v19.16b
-       ext     v28.16b, v5.16b, v27.16b, #12
-       ushr    v27.4s, v22.4s, #12
-       shl     v22.4s, v22.4s, #20
-       add     v6.4s, v25.4s, v6.4s
-       orr     v22.16b, v22.16b, v27.16b
-       add     v6.4s, v6.4s, v22.4s
-       eor     v26.16b, v26.16b, v6.16b
-       add     v6.4s, v6.4s, v18.4s
-       tbl     v18.16b, { v26.16b }, v1.16b
-       add     v19.4s, v19.4s, v18.4s
-       eor     v22.16b, v22.16b, v19.16b
-       ushr    v26.4s, v22.4s, #7
-       shl     v22.4s, v22.4s, #25
-       ext     v6.16b, v6.16b, v6.16b, #4
-       orr     v22.16b, v22.16b, v26.16b
-       ext     v18.16b, v18.16b, v18.16b, #8
-       add     v6.4s, v6.4s, v22.4s
-       eor     v18.16b, v6.16b, v18.16b
-       ext     v19.16b, v19.16b, v19.16b, #12
-       tbl     v18.16b, { v18.16b }, v0.16b
-       add     v19.4s, v19.4s, v18.4s
-       eor     v22.16b, v22.16b, v19.16b
-       ushr    v26.4s, v22.4s, #12
-       shl     v22.4s, v22.4s, #20
-       add     v6.4s, v6.4s, v7.4s
-       orr     v22.16b, v22.16b, v26.16b
-       add     v6.4s, v6.4s, v22.4s
-       ext     v26.16b, v6.16b, v6.16b, #12
-       eor     v6.16b, v18.16b, v6.16b
-       uzp2    v4.4s, v4.4s, v7.4s
-       zip2    v25.4s, v7.4s, v16.4s
-       add     v26.4s, v26.4s, v21.4s
-       zip1    v20.2d, v16.2d, v7.2d
-       tbl     v6.16b, { v6.16b }, v1.16b
-       ext     v24.16b, v4.16b, v4.16b, #4
-       tbl     v27.16b, { v20.16b, v21.16b }, v2.16b
-       zip1    v7.4s, v25.4s, v21.4s
-       zip1    v20.4s, v21.4s, v25.4s
-       add     v18.4s, v19.4s, v6.4s
-       uzp1    v5.4s, v24.4s, v24.4s
-       ext     v21.16b, v27.16b, v27.16b, #12
-       ext     v7.16b, v20.16b, v7.16b, #8
-       eor     v19.16b, v22.16b, v18.16b
-       ext     v5.16b, v5.16b, v24.16b, #8
-       tbl     v17.16b, { v28.16b, v29.16b }, v3.16b
-       uzp1    v21.4s, v27.4s, v21.4s
-       mov     v28.16b, v7.16b
-       ushr    v22.4s, v19.4s, #7
-       shl     v19.4s, v19.4s, #25
-       ext     v23.16b, v24.16b, v24.16b, #12
-       uzp2    v5.4s, v5.4s, v17.4s
-       mov     v28.s[1], v21.s[2]
-       orr     v19.16b, v19.16b, v22.16b
-       ext     v27.16b, v24.16b, v23.16b, #12
-       ext     v23.16b, v5.16b, v5.16b, #4
-       ext     v6.16b, v6.16b, v6.16b, #8
-       ext     v25.16b, v18.16b, v18.16b, #4
-       add     v18.4s, v26.4s, v19.4s
-       uzp1    v24.4s, v23.4s, v23.4s
-       eor     v6.16b, v18.16b, v6.16b
-       ext     v24.16b, v24.16b, v23.16b, #8
-       add     v16.4s, v18.4s, v16.4s
-       tbl     v18.16b, { v27.16b, v28.16b }, v3.16b
-       tbl     v27.16b, { v6.16b }, v0.16b
-       uzp2    v6.4s, v24.4s, v18.4s
-       add     v24.4s, v25.4s, v27.4s
-       eor     v19.16b, v19.16b, v24.16b
-       ushr    v25.4s, v19.4s, #12
-       shl     v19.4s, v19.4s, #20
-       orr     v19.16b, v19.16b, v25.16b
-       add     v16.4s, v16.4s, v19.4s
-       eor     v25.16b, v27.16b, v16.16b
-       add     v4.4s, v16.4s, v4.4s
-       tbl     v16.16b, { v25.16b }, v1.16b
-       add     v24.4s, v24.4s, v16.4s
-       eor     v19.16b, v19.16b, v24.16b
-       ushr    v25.4s, v19.4s, #7
-       shl     v19.4s, v19.4s, #25
-       ext     v4.16b, v4.16b, v4.16b, #4
-       orr     v19.16b, v19.16b, v25.16b
-       ext     v16.16b, v16.16b, v16.16b, #8
-       add     v4.4s, v4.4s, v19.4s
-       eor     v16.16b, v4.16b, v16.16b
-       ext     v24.16b, v24.16b, v24.16b, #12
-       tbl     v25.16b, { v16.16b }, v0.16b
-       add     v24.4s, v24.4s, v25.4s
-       eor     v16.16b, v19.16b, v24.16b
-       ushr    v19.4s, v16.4s, #12
-       shl     v16.4s, v16.4s, #20
-       add     v4.4s, v4.4s, v17.4s
-       orr     v19.16b, v16.16b, v19.16b
-       add     v27.4s, v4.4s, v19.4s
-       eor     v25.16b, v25.16b, v27.16b
-       tbl     v25.16b, { v25.16b }, v1.16b
-       add     v24.4s, v24.4s, v25.4s
-       zip2    v26.4s, v17.4s, v7.4s
-       ext     v4.16b, v27.16b, v27.16b, #12
-       eor     v19.16b, v19.16b, v24.16b
-       add     v28.4s, v4.4s, v21.4s
-       zip1    v20.2d, v7.2d, v17.2d
-       zip1    v4.4s, v26.4s, v21.4s
-       zip1    v17.4s, v21.4s, v26.4s
-       ushr    v26.4s, v19.4s, #7
-       shl     v19.4s, v19.4s, #25
-       orr     v19.16b, v19.16b, v26.16b
-       ext     v25.16b, v25.16b, v25.16b, #8
-       add     v27.4s, v28.4s, v19.4s
-       eor     v25.16b, v27.16b, v25.16b
-       ext     v24.16b, v24.16b, v24.16b, #4
-       tbl     v25.16b, { v25.16b }, v0.16b
-       add     v24.4s, v24.4s, v25.4s
-       eor     v19.16b, v19.16b, v24.16b
-       add     v7.4s, v27.4s, v7.4s
-       ushr    v27.4s, v19.4s, #12
-       shl     v19.4s, v19.4s, #20
-       orr     v19.16b, v19.16b, v27.16b
-       add     v7.4s, v7.4s, v19.4s
-       eor     v25.16b, v25.16b, v7.16b
-       add     v5.4s, v7.4s, v5.4s
-       tbl     v7.16b, { v25.16b }, v1.16b
-       add     v24.4s, v24.4s, v7.4s
-       eor     v19.16b, v19.16b, v24.16b
-       ushr    v25.4s, v19.4s, #7
-       shl     v19.4s, v19.4s, #25
-       ext     v5.16b, v5.16b, v5.16b, #4
-       orr     v19.16b, v19.16b, v25.16b
+       hint    #34
+       fmov    s1, w3
+       movi    d0, #0x0000ff000000ff
+       ldr     q2, [x1]
+       adrp    x8, .LCPI1_0
+       mov     v1.s[1], w5
+       str     q2, [x0]
+       ldr     q4, [x8, :lo12:.LCPI1_0]
+       ldr     q5, [x1, #16]
+       adrp    x8, .LCPI1_1
+       and     v0.8b, v1.8b, v0.8b
+       fmov    d1, x4
+       stp     q5, q4, [x0, #16]
+       mov     v1.d[1], v0.d[0]
+       str     q1, [x0, #48]
+       ldp     q6, q7, [x2]
+       uzp1    v3.4s, v6.4s, v7.4s
+       add     v0.4s, v2.4s, v3.4s
+       uzp2    v2.4s, v6.4s, v7.4s
+       add     v16.4s, v0.4s, v5.4s
+       ldr     q0, [x8, :lo12:.LCPI1_1]
+       adrp    x8, .LCPI1_2
+       eor     v1.16b, v16.16b, v1.16b
+       add     v7.4s, v16.4s, v2.4s
+       tbl     v1.16b, { v1.16b }, v0.16b
+       add     v4.4s, v1.4s, v4.4s
+       eor     v5.16b, v4.16b, v5.16b
+       ushr    v6.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       orr     v5.16b, v5.16b, v6.16b
+       add     v6.4s, v7.4s, v5.4s
+       eor     v7.16b, v1.16b, v6.16b
+       ldr     q1, [x8, :lo12:.LCPI1_2]
+       add     x8, x2, #32
+       tbl     v7.16b, { v7.16b }, v1.16b
+       ld2     { v16.4s, v17.4s }, [x8]
+       add     v4.4s, v4.4s, v7.4s
        ext     v7.16b, v7.16b, v7.16b, #8
-       add     v5.4s, v5.4s, v19.4s
-       eor     v7.16b, v5.16b, v7.16b
-       ext     v24.16b, v24.16b, v24.16b, #12
-       tbl     v7.16b, { v7.16b }, v0.16b
-       add     v24.4s, v24.4s, v7.4s
-       eor     v19.16b, v19.16b, v24.16b
-       ushr    v25.4s, v19.4s, #12
-       shl     v19.4s, v19.4s, #20
-       tbl     v16.16b, { v20.16b, v21.16b }, v2.16b
-       add     v5.4s, v5.4s, v18.4s
-       orr     v19.16b, v19.16b, v25.16b
-       ext     v20.16b, v16.16b, v16.16b, #12
-       ext     v4.16b, v17.16b, v4.16b, #8
-       add     v5.4s, v5.4s, v19.4s
-       uzp1    v21.4s, v16.4s, v20.4s
-       mov     v17.16b, v4.16b
-       ext     v25.16b, v5.16b, v5.16b, #12
-       mov     v17.s[1], v21.s[2]
-       add     v25.4s, v25.4s, v21.4s
-       zip1    v20.2d, v4.2d, v18.2d
-       ext     v22.16b, v23.16b, v23.16b, #12
-       zip2    v26.4s, v18.4s, v4.4s
-       tbl     v18.16b, { v20.16b, v21.16b }, v2.16b
-       eor     v5.16b, v7.16b, v5.16b
-       ext     v16.16b, v23.16b, v22.16b, #12
-       ext     v22.16b, v6.16b, v6.16b, #4
-       zip1    v27.4s, v26.4s, v21.4s
-       zip1    v20.4s, v21.4s, v26.4s
-       ext     v21.16b, v18.16b, v18.16b, #12
-       tbl     v5.16b, { v5.16b }, v1.16b
-       ext     v20.16b, v20.16b, v27.16b, #8
-       uzp1    v27.4s, v18.4s, v21.4s
-       uzp1    v18.4s, v22.4s, v22.4s
-       add     v21.4s, v24.4s, v5.4s
-       ext     v18.16b, v18.16b, v22.16b, #8
-       eor     v19.16b, v19.16b, v21.16b
-       tbl     v7.16b, { v16.16b, v17.16b }, v3.16b
-       uzp2    v18.4s, v18.4s, v17.4s
-       zip2    v16.4s, v16.4s, v20.4s
-       ushr    v17.4s, v19.4s, #7
-       shl     v19.4s, v19.4s, #25
-       orr     v17.16b, v19.16b, v17.16b
-       ext     v5.16b, v5.16b, v5.16b, #8
-       add     v19.4s, v25.4s, v17.4s
-       eor     v5.16b, v19.16b, v5.16b
-       ext     v21.16b, v21.16b, v21.16b, #4
-       tbl     v5.16b, { v5.16b }, v0.16b
-       add     v4.4s, v19.4s, v4.4s
-       add     v19.4s, v21.4s, v5.4s
-       eor     v17.16b, v17.16b, v19.16b
-       ushr    v21.4s, v17.4s, #12
-       shl     v17.4s, v17.4s, #20
-       orr     v17.16b, v17.16b, v21.16b
-       add     v4.4s, v4.4s, v17.4s
-       eor     v5.16b, v5.16b, v4.16b
-       tbl     v5.16b, { v5.16b }, v1.16b
-       add     v4.4s, v4.4s, v6.4s
-       add     v6.4s, v19.4s, v5.4s
-       eor     v17.16b, v17.16b, v6.16b
-       ushr    v19.4s, v17.4s, #7
-       shl     v17.4s, v17.4s, #25
-       ext     v4.16b, v4.16b, v4.16b, #4
-       orr     v17.16b, v17.16b, v19.16b
-       ext     v5.16b, v5.16b, v5.16b, #8
-       add     v4.4s, v4.4s, v17.4s
+       add     v6.4s, v6.4s, v16.4s
        eor     v5.16b, v4.16b, v5.16b
+       ext     v4.16b, v4.16b, v4.16b, #4
+       ext     v16.16b, v16.16b, v16.16b, #12
        ext     v6.16b, v6.16b, v6.16b, #12
-       tbl     v5.16b, { v5.16b }, v0.16b
+       ushr    v18.4s, v5.4s, #7
+       shl     v5.4s, v5.4s, #25
+       orr     v5.16b, v5.16b, v18.16b
+       ext     v18.16b, v17.16b, v17.16b, #12
        add     v6.4s, v6.4s, v5.4s
-       eor     v17.16b, v17.16b, v6.16b
-       ushr    v19.4s, v17.4s, #12
-       shl     v17.4s, v17.4s, #20
+       mov     v17.16b, v18.16b
+       eor     v7.16b, v7.16b, v6.16b
+       add     v6.4s, v6.4s, v18.4s
+       mov     v17.s[1], v16.s[2]
+       tbl     v7.16b, { v7.16b }, v0.16b
        add     v4.4s, v4.4s, v7.4s
-       orr     v17.16b, v17.16b, v19.16b
-       add     v4.4s, v4.4s, v17.4s
-       eor     v5.16b, v5.16b, v4.16b
-       tbl     v5.16b, { v5.16b }, v1.16b
-       mov     v29.16b, v20.16b
+       eor     v5.16b, v4.16b, v5.16b
+       ushr    v19.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       orr     v5.16b, v5.16b, v19.16b
+       uzp1    v19.4s, v3.4s, v3.4s
+       add     v6.4s, v6.4s, v5.4s
+       ext     v19.16b, v19.16b, v3.16b, #8
+       eor     v7.16b, v7.16b, v6.16b
+       uzp2    v19.4s, v19.4s, v2.4s
+       tbl     v7.16b, { v7.16b }, v1.16b
+       add     v6.4s, v6.4s, v19.4s
+       add     v4.4s, v4.4s, v7.4s
+       ext     v6.16b, v6.16b, v6.16b, #4
+       ext     v7.16b, v7.16b, v7.16b, #8
+       eor     v5.16b, v4.16b, v5.16b
        ext     v4.16b, v4.16b, v4.16b, #12
+       ushr    v20.4s, v5.4s, #7
+       shl     v5.4s, v5.4s, #25
+       orr     v5.16b, v5.16b, v20.16b
+       ext     v20.16b, v3.16b, v3.16b, #12
        add     v6.4s, v6.4s, v5.4s
-       mov     v29.s[1], v27.s[2]
-       add     v4.4s, v4.4s, v27.4s
-       zip1    v26.2d, v20.2d, v7.2d
-       zip1    v7.4s, v16.4s, v27.4s
-       zip1    v16.4s, v27.4s, v16.4s
-       eor     v17.16b, v17.16b, v6.16b
-       ext     v7.16b, v16.16b, v7.16b, #8
-       ushr    v16.4s, v17.4s, #7
-       shl     v17.4s, v17.4s, #25
-       orr     v16.16b, v17.16b, v16.16b
-       ext     v5.16b, v5.16b, v5.16b, #8
-       add     v4.4s, v4.4s, v16.4s
+       ext     v3.16b, v3.16b, v20.16b, #12
+       eor     v7.16b, v7.16b, v6.16b
+       rev64   v3.4s, v3.4s
+       tbl     v7.16b, { v7.16b }, v0.16b
+       trn2    v3.4s, v3.4s, v17.4s
+       add     v4.4s, v4.4s, v7.4s
+       add     v6.4s, v6.4s, v3.4s
        eor     v5.16b, v4.16b, v5.16b
-       ext     v6.16b, v6.16b, v6.16b, #4
-       tbl     v5.16b, { v5.16b }, v0.16b
+       ushr    v17.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       orr     v5.16b, v5.16b, v17.16b
+       zip1    v17.2d, v18.2d, v2.2d
+       zip2    v2.4s, v2.4s, v18.4s
        add     v6.4s, v6.4s, v5.4s
-       eor     v16.16b, v16.16b, v6.16b
-       ushr    v17.4s, v16.4s, #12
-       shl     v16.4s, v16.4s, #20
-       add     v4.4s, v4.4s, v20.4s
-       orr     v16.16b, v16.16b, v17.16b
-       add     v4.4s, v4.4s, v16.4s
-       eor     v5.16b, v5.16b, v4.16b
-       tbl     v5.16b, { v5.16b }, v1.16b
+       mov     v17.s[3], v16.s[3]
+       zip1    v18.4s, v2.4s, v16.4s
+       zip1    v2.4s, v16.4s, v2.4s
+       eor     v7.16b, v7.16b, v6.16b
+       ext     v6.16b, v6.16b, v6.16b, #12
+       ext     v16.16b, v2.16b, v18.16b, #8
+       tbl     v7.16b, { v7.16b }, v1.16b
+       add     v20.4s, v4.4s, v7.4s
+       ext     v4.16b, v17.16b, v17.16b, #12
+       ext     v7.16b, v7.16b, v7.16b, #8
+       eor     v5.16b, v20.16b, v5.16b
+       uzp1    v4.4s, v17.4s, v4.4s
+       ushr    v17.4s, v5.4s, #7
+       shl     v5.4s, v5.4s, #25
+       add     v6.4s, v6.4s, v4.4s
+       orr     v5.16b, v5.16b, v17.16b
+       ext     v17.16b, v20.16b, v20.16b, #4
        add     v6.4s, v6.4s, v5.4s
-       eor     v16.16b, v16.16b, v6.16b
+       eor     v7.16b, v7.16b, v6.16b
+       add     v6.4s, v6.4s, v16.4s
+       tbl     v7.16b, { v7.16b }, v0.16b
+       add     v17.4s, v17.4s, v7.4s
+       eor     v5.16b, v17.16b, v5.16b
+       ushr    v2.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       orr     v2.16b, v5.16b, v2.16b
+       add     v5.4s, v6.4s, v2.4s
+       ext     v6.16b, v19.16b, v19.16b, #4
+       eor     v7.16b, v7.16b, v5.16b
+       uzp1    v18.4s, v6.4s, v6.4s
+       tbl     v7.16b, { v7.16b }, v1.16b
+       ext     v18.16b, v18.16b, v6.16b, #8
+       add     v17.4s, v17.4s, v7.4s
+       uzp2    v18.4s, v18.4s, v3.4s
+       ext     v7.16b, v7.16b, v7.16b, #8
+       eor     v2.16b, v17.16b, v2.16b
+       add     v5.4s, v5.4s, v18.4s
+       ext     v17.16b, v17.16b, v17.16b, #12
+       ushr    v19.4s, v2.4s, #7
+       shl     v2.4s, v2.4s, #25
+       ext     v5.16b, v5.16b, v5.16b, #4
+       orr     v2.16b, v2.16b, v19.16b
+       ext     v19.16b, v6.16b, v6.16b, #12
+       add     v5.4s, v5.4s, v2.4s
+       ext     v6.16b, v6.16b, v19.16b, #12
+       mov     v19.16b, v16.16b
+       eor     v7.16b, v7.16b, v5.16b
+       rev64   v6.4s, v6.4s
+       mov     v19.s[1], v4.s[2]
+       tbl     v7.16b, { v7.16b }, v0.16b
+       add     v17.4s, v17.4s, v7.4s
+       eor     v20.16b, v17.16b, v2.16b
+       trn2    v2.4s, v6.4s, v19.4s
+       ushr    v6.4s, v20.4s, #12
+       shl     v19.4s, v20.4s, #20
+       add     v5.4s, v5.4s, v2.4s
+       orr     v6.16b, v19.16b, v6.16b
+       add     v19.4s, v5.4s, v6.4s
+       eor     v5.16b, v7.16b, v19.16b
+       zip1    v7.2d, v16.2d, v3.2d
+       zip2    v3.4s, v3.4s, v16.4s
+       tbl     v20.16b, { v5.16b }, v1.16b
+       mov     v7.s[3], v4.s[3]
+       add     v17.4s, v17.4s, v20.4s
+       ext     v5.16b, v7.16b, v7.16b, #12
+       eor     v6.16b, v17.16b, v6.16b
+       uzp1    v5.4s, v7.4s, v5.4s
+       ext     v7.16b, v19.16b, v19.16b, #12
+       ext     v17.16b, v17.16b, v17.16b, #4
+       ushr    v19.4s, v6.4s, #7
+       shl     v6.4s, v6.4s, #25
+       add     v7.4s, v7.4s, v5.4s
+       orr     v6.16b, v6.16b, v19.16b
+       ext     v19.16b, v20.16b, v20.16b, #8
+       add     v7.4s, v7.4s, v6.4s
+       eor     v19.16b, v19.16b, v7.16b
+       tbl     v19.16b, { v19.16b }, v0.16b
+       add     v16.4s, v17.4s, v19.4s
+       zip1    v17.4s, v3.4s, v4.4s
+       zip1    v3.4s, v4.4s, v3.4s
+       eor     v4.16b, v16.16b, v6.16b
+       ext     v17.16b, v3.16b, v17.16b, #8
+       ushr    v3.4s, v4.4s, #12
+       shl     v4.4s, v4.4s, #20
+       add     v6.4s, v7.4s, v17.4s
+       orr     v3.16b, v4.16b, v3.16b
+       add     v4.4s, v6.4s, v3.4s
+       ext     v6.16b, v18.16b, v18.16b, #4
+       eor     v7.16b, v19.16b, v4.16b
+       uzp1    v18.4s, v6.4s, v6.4s
+       tbl     v7.16b, { v7.16b }, v1.16b
+       ext     v18.16b, v18.16b, v6.16b, #8
+       add     v16.4s, v16.4s, v7.4s
+       uzp2    v18.4s, v18.4s, v2.4s
+       ext     v7.16b, v7.16b, v7.16b, #8
+       eor     v3.16b, v16.16b, v3.16b
        add     v4.4s, v4.4s, v18.4s
-       ushr    v17.4s, v16.4s, #7
-       shl     v16.4s, v16.4s, #25
-       ext     v23.16b, v22.16b, v22.16b, #12
+       ext     v16.16b, v16.16b, v16.16b, #12
+       ushr    v19.4s, v3.4s, #7
+       shl     v3.4s, v3.4s, #25
        ext     v4.16b, v4.16b, v4.16b, #4
-       orr     v16.16b, v16.16b, v17.16b
-       ext     v28.16b, v22.16b, v23.16b, #12
-       ext     v5.16b, v5.16b, v5.16b, #8
-       add     v4.4s, v16.4s, v4.4s
-       tbl     v3.16b, { v28.16b, v29.16b }, v3.16b
-       eor     v5.16b, v4.16b, v5.16b
-       ext     v6.16b, v6.16b, v6.16b, #12
-       add     v3.4s, v4.4s, v3.4s
-       tbl     v4.16b, { v5.16b }, v0.16b
-       add     v5.4s, v6.4s, v4.4s
-       eor     v6.16b, v16.16b, v5.16b
-       ushr    v16.4s, v6.4s, #12
+       orr     v3.16b, v3.16b, v19.16b
+       ext     v19.16b, v6.16b, v6.16b, #12
+       add     v4.4s, v4.4s, v3.4s
+       ext     v6.16b, v6.16b, v19.16b, #12
+       mov     v19.16b, v17.16b
+       eor     v7.16b, v7.16b, v4.16b
+       rev64   v6.4s, v6.4s
+       mov     v19.s[1], v5.s[2]
+       tbl     v7.16b, { v7.16b }, v0.16b
+       add     v16.4s, v16.4s, v7.4s
+       eor     v20.16b, v16.16b, v3.16b
+       trn2    v3.4s, v6.4s, v19.4s
+       ushr    v6.4s, v20.4s, #12
+       shl     v19.4s, v20.4s, #20
+       add     v4.4s, v4.4s, v3.4s
+       orr     v6.16b, v19.16b, v6.16b
+       zip1    v19.2d, v17.2d, v2.2d
+       zip2    v2.4s, v2.4s, v17.4s
+       add     v4.4s, v4.4s, v6.4s
+       mov     v19.s[3], v5.s[3]
+       zip1    v17.4s, v2.4s, v5.4s
+       zip1    v2.4s, v5.4s, v2.4s
+       eor     v7.16b, v7.16b, v4.16b
+       ext     v20.16b, v19.16b, v19.16b, #12
+       ext     v4.16b, v4.16b, v4.16b, #12
+       ext     v2.16b, v2.16b, v17.16b, #8
+       tbl     v7.16b, { v7.16b }, v1.16b
+       add     v16.4s, v16.4s, v7.4s
+       ext     v7.16b, v7.16b, v7.16b, #8
+       eor     v21.16b, v16.16b, v6.16b
+       uzp1    v6.4s, v19.4s, v20.4s
+       ext     v16.16b, v16.16b, v16.16b, #4
+       ushr    v19.4s, v21.4s, #7
+       shl     v20.4s, v21.4s, #25
+       add     v4.4s, v4.4s, v6.4s
+       orr     v19.16b, v20.16b, v19.16b
+       add     v4.4s, v4.4s, v19.4s
+       eor     v7.16b, v7.16b, v4.16b
+       add     v4.4s, v4.4s, v2.4s
+       tbl     v7.16b, { v7.16b }, v0.16b
+       add     v16.4s, v16.4s, v7.4s
+       eor     v5.16b, v16.16b, v19.16b
+       ushr    v17.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       orr     v5.16b, v5.16b, v17.16b
+       ext     v17.16b, v18.16b, v18.16b, #4
+       add     v4.4s, v4.4s, v5.4s
+       uzp1    v18.4s, v17.4s, v17.4s
+       eor     v7.16b, v7.16b, v4.16b
+       ext     v18.16b, v18.16b, v17.16b, #8
+       tbl     v7.16b, { v7.16b }, v1.16b
+       uzp2    v18.4s, v18.4s, v3.4s
+       add     v16.4s, v16.4s, v7.4s
+       add     v4.4s, v4.4s, v18.4s
+       ext     v7.16b, v7.16b, v7.16b, #8
+       eor     v5.16b, v16.16b, v5.16b
+       ext     v4.16b, v4.16b, v4.16b, #4
+       ext     v16.16b, v16.16b, v16.16b, #12
+       ushr    v19.4s, v5.4s, #7
+       shl     v5.4s, v5.4s, #25
+       orr     v5.16b, v5.16b, v19.16b
+       add     v19.4s, v4.4s, v5.4s
+       eor     v4.16b, v7.16b, v19.16b
+       ext     v7.16b, v17.16b, v17.16b, #12
+       tbl     v20.16b, { v4.16b }, v0.16b
+       ext     v4.16b, v17.16b, v7.16b, #12
+       mov     v7.16b, v2.16b
+       add     v16.4s, v16.4s, v20.4s
+       rev64   v4.4s, v4.4s
+       mov     v7.s[1], v6.s[2]
+       eor     v5.16b, v16.16b, v5.16b
+       trn2    v4.4s, v4.4s, v7.4s
+       ushr    v7.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       add     v17.4s, v19.4s, v4.4s
+       zip1    v19.2d, v2.2d, v3.2d
+       zip2    v2.4s, v3.4s, v2.4s
+       orr     v5.16b, v5.16b, v7.16b
+       mov     v19.s[3], v6.s[3]
+       add     v7.4s, v17.4s, v5.4s
+       eor     v17.16b, v20.16b, v7.16b
+       ext     v20.16b, v19.16b, v19.16b, #12
+       ext     v7.16b, v7.16b, v7.16b, #12
+       tbl     v17.16b, { v17.16b }, v1.16b
+       add     v16.4s, v16.4s, v17.4s
+       ext     v17.16b, v17.16b, v17.16b, #8
+       eor     v21.16b, v16.16b, v5.16b
+       uzp1    v5.4s, v19.4s, v20.4s
+       ext     v16.16b, v16.16b, v16.16b, #4
+       ushr    v19.4s, v21.4s, #7
+       shl     v20.4s, v21.4s, #25
+       add     v7.4s, v7.4s, v5.4s
+       orr     v19.16b, v20.16b, v19.16b
+       add     v7.4s, v7.4s, v19.4s
+       eor     v17.16b, v17.16b, v7.16b
+       tbl     v17.16b, { v17.16b }, v0.16b
+       add     v3.4s, v16.4s, v17.4s
+       zip1    v16.4s, v2.4s, v6.4s
+       zip1    v2.4s, v6.4s, v2.4s
+       eor     v6.16b, v3.16b, v19.16b
+       ext     v16.16b, v2.16b, v16.16b, #8
+       ushr    v2.4s, v6.4s, #12
        shl     v6.4s, v6.4s, #20
-       orr     v6.16b, v6.16b, v16.16b
-       tbl     v2.16b, { v26.16b, v27.16b }, v2.16b
-       add     v3.4s, v3.4s, v6.4s
-       ext     v19.16b, v2.16b, v2.16b, #12
-       eor     v4.16b, v4.16b, v3.16b
-       uzp1    v2.4s, v2.4s, v19.4s
+       add     v7.4s, v7.4s, v16.4s
+       orr     v2.16b, v6.16b, v2.16b
+       add     v6.4s, v7.4s, v2.4s
+       ext     v7.16b, v18.16b, v18.16b, #4
+       eor     v17.16b, v17.16b, v6.16b
+       uzp1    v18.4s, v7.4s, v7.4s
+       tbl     v17.16b, { v17.16b }, v1.16b
+       ext     v18.16b, v18.16b, v7.16b, #8
+       add     v3.4s, v3.4s, v17.4s
+       uzp2    v18.4s, v18.4s, v4.4s
+       eor     v2.16b, v3.16b, v2.16b
+       add     v6.4s, v6.4s, v18.4s
        ext     v3.16b, v3.16b, v3.16b, #12
-       tbl     v4.16b, { v4.16b }, v1.16b
-       add     v2.4s, v3.4s, v2.4s
-       add     v3.4s, v5.4s, v4.4s
-       eor     v5.16b, v6.16b, v3.16b
-       ushr    v6.4s, v5.4s, #7
+       ext     v18.16b, v18.16b, v18.16b, #4
+       ushr    v19.4s, v2.4s, #7
+       shl     v2.4s, v2.4s, #25
+       ext     v6.16b, v6.16b, v6.16b, #4
+       orr     v19.16b, v2.16b, v19.16b
+       ext     v2.16b, v17.16b, v17.16b, #8
+       ext     v17.16b, v7.16b, v7.16b, #12
+       add     v6.4s, v6.4s, v19.4s
+       eor     v2.16b, v2.16b, v6.16b
+       tbl     v20.16b, { v2.16b }, v0.16b
+       ext     v2.16b, v7.16b, v17.16b, #12
+       mov     v7.16b, v16.16b
+       add     v17.4s, v3.4s, v20.4s
+       rev64   v3.4s, v2.4s
+       mov     v7.s[1], v5.s[2]
+       eor     v19.16b, v17.16b, v19.16b
+       trn2    v3.4s, v3.4s, v7.4s
+       ushr    v21.4s, v19.4s, #12
+       shl     v19.4s, v19.4s, #20
+       add     v6.4s, v6.4s, v3.4s
+       orr     v19.16b, v19.16b, v21.16b
+       add     v21.4s, v6.4s, v19.4s
+       eor     v6.16b, v20.16b, v21.16b
+       zip1    v20.2d, v16.2d, v4.2d
+       zip2    v4.4s, v4.4s, v16.4s
+       tbl     v22.16b, { v6.16b }, v1.16b
+       mov     v20.s[3], v5.s[3]
+       add     v17.4s, v17.4s, v22.4s
+       ext     v6.16b, v20.16b, v20.16b, #12
+       eor     v19.16b, v17.16b, v19.16b
+       uzp1    v6.4s, v20.4s, v6.4s
+       ext     v20.16b, v21.16b, v21.16b, #12
+       ext     v17.16b, v17.16b, v17.16b, #4
+       ushr    v21.4s, v19.4s, #7
+       shl     v19.4s, v19.4s, #25
+       add     v20.4s, v20.4s, v6.4s
+       orr     v19.16b, v19.16b, v21.16b
+       ext     v21.16b, v22.16b, v22.16b, #8
+       add     v20.4s, v20.4s, v19.4s
+       eor     v21.16b, v21.16b, v20.16b
+       tbl     v21.16b, { v21.16b }, v0.16b
+       add     v16.4s, v17.4s, v21.4s
+       zip1    v17.4s, v4.4s, v5.4s
+       zip1    v4.4s, v5.4s, v4.4s
+       eor     v5.16b, v16.16b, v19.16b
+       ext     v4.16b, v4.16b, v17.16b, #8
+       ushr    v17.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       add     v19.4s, v20.4s, v4.4s
+       ext     v20.16b, v18.16b, v18.16b, #8
+       zip1    v3.2d, v4.2d, v3.2d
+       orr     v5.16b, v5.16b, v17.16b
+       zip2    v2.4s, v2.4s, v4.4s
+       uzp2    v7.4s, v20.4s, v7.4s
+       mov     v3.s[3], v6.s[3]
+       add     v17.4s, v19.4s, v5.4s
+       ext     v7.16b, v7.16b, v20.16b, #4
+       eor     v19.16b, v21.16b, v17.16b
+       ext     v17.16b, v17.16b, v17.16b, #4
+       tbl     v19.16b, { v19.16b }, v1.16b
+       add     v7.4s, v17.4s, v7.4s
+       add     v16.4s, v16.4s, v19.4s
+       ext     v17.16b, v19.16b, v19.16b, #8
+       ext     v19.16b, v18.16b, v18.16b, #12
+       eor     v5.16b, v16.16b, v5.16b
+       ext     v16.16b, v16.16b, v16.16b, #12
+       ext     v18.16b, v18.16b, v19.16b, #12
+       mov     v19.16b, v4.16b
+       ushr    v20.4s, v5.4s, #7
        shl     v5.4s, v5.4s, #25
-       orr     v5.16b, v5.16b, v6.16b
-       ext     v4.16b, v4.16b, v4.16b, #8
-       add     v2.4s, v2.4s, v5.4s
-       eor     v4.16b, v2.16b, v4.16b
-       ext     v3.16b, v3.16b, v3.16b, #4
-       tbl     v0.16b, { v4.16b }, v0.16b
-       add     v3.4s, v3.4s, v0.4s
-       eor     v4.16b, v5.16b, v3.16b
-       ushr    v5.4s, v4.4s, #12
-       shl     v4.4s, v4.4s, #20
-       add     v2.4s, v2.4s, v7.4s
-       orr     v4.16b, v4.16b, v5.16b
-       add     v2.4s, v2.4s, v4.4s
+       rev64   v18.4s, v18.4s
+       mov     v19.s[1], v6.s[2]
+       orr     v5.16b, v5.16b, v20.16b
+       trn2    v18.4s, v18.4s, v19.4s
+       add     v7.4s, v5.4s, v7.4s
+       eor     v17.16b, v17.16b, v7.16b
+       add     v7.4s, v7.4s, v18.4s
+       ext     v18.16b, v3.16b, v3.16b, #12
+       tbl     v17.16b, { v17.16b }, v0.16b
+       uzp1    v3.4s, v3.4s, v18.4s
+       add     v16.4s, v16.4s, v17.4s
+       eor     v5.16b, v16.16b, v5.16b
+       ushr    v19.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       orr     v5.16b, v5.16b, v19.16b
+       add     v7.4s, v7.4s, v5.4s
+       eor     v17.16b, v17.16b, v7.16b
+       ext     v7.16b, v7.16b, v7.16b, #12
+       tbl     v17.16b, { v17.16b }, v1.16b
+       add     v3.4s, v7.4s, v3.4s
+       add     v16.4s, v16.4s, v17.4s
+       ext     v7.16b, v17.16b, v17.16b, #8
+       eor     v5.16b, v16.16b, v5.16b
+       ext     v16.16b, v16.16b, v16.16b, #4
+       ushr    v18.4s, v5.4s, #7
+       shl     v5.4s, v5.4s, #25
+       orr     v5.16b, v5.16b, v18.16b
+       add     v3.4s, v3.4s, v5.4s
+       eor     v7.16b, v7.16b, v3.16b
+       tbl     v0.16b, { v7.16b }, v0.16b
+       zip1    v7.4s, v2.4s, v6.4s
+       zip1    v2.4s, v6.4s, v2.4s
+       add     v4.4s, v16.4s, v0.4s
+       ext     v2.16b, v2.16b, v7.16b, #8
+       eor     v5.16b, v4.16b, v5.16b
+       add     v2.4s, v3.4s, v2.4s
+       ushr    v6.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       orr     v3.16b, v5.16b, v6.16b
+       add     v2.4s, v2.4s, v3.4s
        eor     v0.16b, v0.16b, v2.16b
-       tbl     v0.16b, { v0.16b }, v1.16b
-       add     v1.4s, v3.4s, v0.4s
-       eor     v3.16b, v4.16b, v1.16b
        ext     v2.16b, v2.16b, v2.16b, #4
+       tbl     v0.16b, { v0.16b }, v1.16b
+       add     v1.4s, v4.4s, v0.4s
+       ext     v0.16b, v0.16b, v0.16b, #8
+       eor     v3.16b, v1.16b, v3.16b
        ext     v1.16b, v1.16b, v1.16b, #12
        ushr    v4.4s, v3.4s, #7
        shl     v3.4s, v3.4s, #25
-       ext     v0.16b, v0.16b, v0.16b, #8
-       eor     v1.16b, v2.16b, v1.16b
-       orr     v2.16b, v3.16b, v4.16b
+       stp     q1, q0, [x0, #32]
+       orr     v3.16b, v3.16b, v4.16b
+       stp     q2, q3, [x0]
+       ret
+.Lfunc_end1:
+       .size   compress_pre, .Lfunc_end1-compress_pre
+       .cfi_endproc
+
+       .globl  zfs_blake3_compress_xof_sse41 /* wrapper: calls compress_pre into a 64-byte stack buffer, then writes 64 bytes through the pointer passed in x5 */
+       .p2align        2
+       .type   zfs_blake3_compress_xof_sse41,@function
+zfs_blake3_compress_xof_sse41:
+       .cfi_startproc
+       hint    #25 /* encoding of PACIASP: sign the return address in x30 */
+       .cfi_negate_ra_state
+       sub     sp, sp, #96 /* 96-byte frame: 64-byte scratch buffer at [sp], saved registers above it */
+       stp     x29, x30, [sp, #64] /* save frame pointer and link register */
+       add     x29, sp, #64 /* x29 = address of the saved x29/x30 pair */
+       stp     x20, x19, [sp, #80] /* save the callee-saved registers used below */
+       .cfi_def_cfa w29, 32
+       .cfi_offset w19, -8
+       .cfi_offset w20, -16
+       .cfi_offset w30, -24
+       .cfi_offset w29, -32
+       mov     x20, x0 /* keep the first argument across the call (presumably the chaining-value pointer - confirm against the C prototype) */
+       mov     x19, x5 /* keep the sixth argument: the output pointer */
+       mov     w5, w4 /* this and the next three moves shift arguments 2..5 up one slot for compress_pre */
+       mov     x4, x3
+       mov     w3, w2
+       mov     x2, x1
+       mov     x0, sp /* compress_pre stores its 64-byte result at [x0]: use the stack scratch buffer */
+       mov     x1, x20 /* the original first argument becomes compress_pre's second */
+       bl      compress_pre
+       ldp     q0, q1, [sp] /* q0,q1 = scratch bytes 0..31 */
+       ldp     q2, q3, [sp, #32] /* q2,q3 = scratch bytes 32..63 */
        eor     v0.16b, v2.16b, v0.16b
-       stp     q1, q0, [x0]
+       eor     v1.16b, v3.16b, v1.16b /* together with the eor above: q0,q1 = scratch[0..31] ^ scratch[32..63] */
+       ldp     x29, x30, [sp, #64] /* restore frame pointer and link register */
+       stp     q0, q1, [x19] /* out[0..31] = scratch[0..31] ^ scratch[32..63] */
+       ldr     q0, [x20] /* bytes 0..15 of the buffer the first argument points to */
+       eor     v0.16b, v0.16b, v2.16b
+       str     q0, [x19, #32] /* out[32..47] = arg0[0..15] ^ scratch[32..47] */
+       ldr     q0, [x20, #16] /* bytes 16..31 of the buffer the first argument points to */
+       eor     v0.16b, v0.16b, v3.16b
+       str     q0, [x19, #48] /* out[48..63] = arg0[16..31] ^ scratch[48..63] */
+       ldp     x20, x19, [sp, #80] /* restore callee-saved registers */
+       add     sp, sp, #96 /* release the frame */
+       hint    #29 /* encoding of AUTIASP: authenticate x30 before returning */
        ret
-.Lfunc_end0:
-       .size   zfs_blake3_compress_in_place_sse41, .Lfunc_end0-zfs_blake3_compress_in_place_sse41
+.Lfunc_end2:
+       .size   zfs_blake3_compress_xof_sse41, .Lfunc_end2-zfs_blake3_compress_xof_sse41
        .cfi_endproc
 
        .section        .rodata.cst16,"aM",@progbits,16
        .p2align        4
-.LCPI1_0:
+.LCPI3_0:
+       .word   0
+       .word   1
+       .word   2
+       .word   3
+.LCPI3_1:
        .byte   2
        .byte   3
        .byte   0
@@ -558,12 +628,7 @@ zfs_blake3_compress_in_place_sse41:
        .byte   15
        .byte   12
        .byte   13
-.LCPI1_1:
-       .word   1779033703
-       .word   3144134277
-       .word   1013904242
-       .word   2773480762
-.LCPI1_2:
+.LCPI3_2:
        .byte   1
        .byte   2
        .byte   3
@@ -580,540 +645,29 @@ zfs_blake3_compress_in_place_sse41:
        .byte   14
        .byte   15
        .byte   12
-.LCPI1_3:
-       .byte   0
-       .byte   1
-       .byte   2
-       .byte   3
-       .byte   20
-       .byte   21
-       .byte   22
-       .byte   23
-       .byte   8
-       .byte   9
-       .byte   10
-       .byte   11
-       .byte   28
-       .byte   29
-       .byte   30
-       .byte   31
-.LCPI1_4:
-       .byte   0
-       .byte   1
-       .byte   2
-       .byte   3
-       .byte   4
-       .byte   5
-       .byte   6
-       .byte   7
-       .byte   8
-       .byte   9
-       .byte   10
-       .byte   11
-       .byte   28
-       .byte   29
-       .byte   30
-       .byte   31
+.LCPI3_3:
+       .word   1779033703
+       .word   3144134277
+       .word   1013904242
+       .word   2773480762
        .text
-       .globl  zfs_blake3_compress_xof_sse41
+       .globl  zfs_blake3_hash_many_sse41
        .p2align        2
-       .type   zfs_blake3_compress_xof_sse41,@function
-zfs_blake3_compress_xof_sse41:
+       .type   zfs_blake3_hash_many_sse41,@function
+zfs_blake3_hash_many_sse41:
        .cfi_startproc
-       ldp     q7, q6, [x0]
-       ldp     q17, q18, [x1]
-       add     x12, x1, #32
-       ld2     { v4.4s, v5.4s }, [x12]
-       lsr     x10, x3, #32
-       fmov    s16, w3
-       adrp    x13, .LCPI1_0
-       adrp    x11, .LCPI1_1
-       and     w8, w2, #0xff
-       mov     v16.s[1], w10
-       ldr     q0, [x13, :lo12:.LCPI1_0]
-       ldr     q20, [x11, :lo12:.LCPI1_1]
-       adrp    x11, .LCPI1_4
-       and     w9, w4, #0xff
-       ldr     q2, [x11, :lo12:.LCPI1_4]
-       mov     v16.s[2], w8
-       uzp1    v21.4s, v17.4s, v18.4s
-       add     v7.4s, v6.4s, v7.4s
-       adrp    x12, .LCPI1_3
-       mov     v16.s[3], w9
-       uzp2    v18.4s, v17.4s, v18.4s
-       add     v7.4s, v7.4s, v21.4s
-       ext     v17.16b, v5.16b, v5.16b, #12
-       ldr     q3, [x12, :lo12:.LCPI1_3]
-       ext     v24.16b, v4.16b, v4.16b, #12
-       eor     v16.16b, v7.16b, v16.16b
-       mov     v27.16b, v17.16b
-       uzp1    v19.4s, v21.4s, v21.4s
-       ext     v25.16b, v21.16b, v21.16b, #12
-       zip2    v28.4s, v18.4s, v17.4s
-       tbl     v29.16b, { v16.16b }, v0.16b
-       mov     v27.s[1], v24.s[2]
-       zip1    v23.2d, v17.2d, v18.2d
-       ext     v19.16b, v19.16b, v21.16b, #8
-       add     v22.4s, v29.4s, v20.4s
-       ext     v26.16b, v21.16b, v25.16b, #12
-       tbl     v20.16b, { v23.16b, v24.16b }, v2.16b
-       zip1    v21.4s, v28.4s, v24.4s
-       zip1    v23.4s, v24.4s, v28.4s
-       uzp2    v19.4s, v19.4s, v18.4s
-       eor     v24.16b, v22.16b, v6.16b
-       ext     v25.16b, v20.16b, v20.16b, #12
-       ext     v6.16b, v23.16b, v21.16b, #8
-       add     v7.4s, v7.4s, v18.4s
-       ext     v18.16b, v19.16b, v19.16b, #4
-       tbl     v16.16b, { v26.16b, v27.16b }, v3.16b
-       uzp1    v21.4s, v20.4s, v25.4s
-       mov     v26.16b, v6.16b
-       ext     v23.16b, v18.16b, v18.16b, #12
-       mov     v26.s[1], v21.s[2]
-       adrp    x10, .LCPI1_2
-       ext     v25.16b, v18.16b, v23.16b, #12
-       uzp1    v23.4s, v18.4s, v18.4s
-       ldr     q1, [x10, :lo12:.LCPI1_2]
-       ext     v18.16b, v23.16b, v18.16b, #8
-       ushr    v23.4s, v24.4s, #12
-       shl     v24.4s, v24.4s, #20
-       orr     v23.16b, v24.16b, v23.16b
-       add     v7.4s, v7.4s, v23.4s
-       eor     v27.16b, v29.16b, v7.16b
-       add     v4.4s, v7.4s, v4.4s
-       tbl     v7.16b, { v25.16b, v26.16b }, v3.16b
-       tbl     v26.16b, { v27.16b }, v1.16b
-       add     v22.4s, v22.4s, v26.4s
-       uzp2    v18.4s, v18.4s, v16.4s
-       eor     v23.16b, v23.16b, v22.16b
-       ext     v5.16b, v18.16b, v18.16b, #4
-       ushr    v27.4s, v23.4s, #7
-       shl     v23.4s, v23.4s, #25
-       uzp1    v25.4s, v5.4s, v5.4s
-       orr     v23.16b, v23.16b, v27.16b
-       ext     v28.16b, v4.16b, v4.16b, #12
-       ext     v4.16b, v25.16b, v5.16b, #8
-       ext     v25.16b, v26.16b, v26.16b, #8
-       add     v26.4s, v28.4s, v23.4s
-       eor     v25.16b, v26.16b, v25.16b
-       ext     v22.16b, v22.16b, v22.16b, #4
-       tbl     v25.16b, { v25.16b }, v0.16b
-       add     v22.4s, v22.4s, v25.4s
-       eor     v23.16b, v23.16b, v22.16b
-       add     v17.4s, v26.4s, v17.4s
-       ushr    v26.4s, v23.4s, #12
-       shl     v23.4s, v23.4s, #20
-       orr     v23.16b, v23.16b, v26.16b
-       add     v17.4s, v17.4s, v23.4s
-       eor     v25.16b, v25.16b, v17.16b
-       add     v17.4s, v17.4s, v19.4s
-       tbl     v19.16b, { v25.16b }, v1.16b
-       add     v22.4s, v22.4s, v19.4s
-       eor     v23.16b, v23.16b, v22.16b
-       ushr    v25.4s, v23.4s, #7
-       shl     v23.4s, v23.4s, #25
-       ext     v17.16b, v17.16b, v17.16b, #4
-       orr     v23.16b, v23.16b, v25.16b
-       ext     v19.16b, v19.16b, v19.16b, #8
-       add     v17.4s, v17.4s, v23.4s
-       eor     v19.16b, v17.16b, v19.16b
-       ext     v22.16b, v22.16b, v22.16b, #12
-       tbl     v19.16b, { v19.16b }, v0.16b
-       add     v22.4s, v22.4s, v19.4s
-       eor     v23.16b, v23.16b, v22.16b
-       ushr    v25.4s, v23.4s, #12
-       shl     v23.4s, v23.4s, #20
-       add     v17.4s, v17.4s, v16.4s
-       orr     v23.16b, v23.16b, v25.16b
-       add     v17.4s, v17.4s, v23.4s
-       ext     v25.16b, v17.16b, v17.16b, #12
-       eor     v17.16b, v19.16b, v17.16b
-       tbl     v17.16b, { v17.16b }, v1.16b
-       add     v19.4s, v22.4s, v17.4s
-       eor     v22.16b, v23.16b, v19.16b
-       add     v25.4s, v25.4s, v21.4s
-       zip1    v20.2d, v6.2d, v16.2d
-       ushr    v23.4s, v22.4s, #7
-       shl     v22.4s, v22.4s, #25
-       zip2    v24.4s, v16.4s, v6.4s
-       tbl     v26.16b, { v20.16b, v21.16b }, v2.16b
-       orr     v22.16b, v22.16b, v23.16b
-       zip1    v16.4s, v24.4s, v21.4s
-       zip1    v20.4s, v21.4s, v24.4s
-       ext     v21.16b, v26.16b, v26.16b, #12
-       ext     v17.16b, v17.16b, v17.16b, #8
-       add     v25.4s, v25.4s, v22.4s
-       ext     v16.16b, v20.16b, v16.16b, #8
-       uzp1    v21.4s, v26.4s, v21.4s
-       eor     v26.16b, v25.16b, v17.16b
-       ext     v19.16b, v19.16b, v19.16b, #4
-       tbl     v26.16b, { v26.16b }, v0.16b
-       mov     v29.16b, v16.16b
-       add     v19.4s, v19.4s, v26.4s
-       ext     v27.16b, v5.16b, v5.16b, #12
-       mov     v29.s[1], v21.s[2]
-       eor     v22.16b, v22.16b, v19.16b
-       ext     v28.16b, v5.16b, v27.16b, #12
-       ushr    v27.4s, v22.4s, #12
-       shl     v22.4s, v22.4s, #20
-       add     v6.4s, v25.4s, v6.4s
-       orr     v22.16b, v22.16b, v27.16b
-       add     v6.4s, v6.4s, v22.4s
-       eor     v26.16b, v26.16b, v6.16b
-       add     v6.4s, v6.4s, v18.4s
-       tbl     v18.16b, { v26.16b }, v1.16b
-       add     v19.4s, v19.4s, v18.4s
-       eor     v22.16b, v22.16b, v19.16b
-       ushr    v26.4s, v22.4s, #7
-       shl     v22.4s, v22.4s, #25
-       ext     v6.16b, v6.16b, v6.16b, #4
-       orr     v22.16b, v22.16b, v26.16b
-       ext     v18.16b, v18.16b, v18.16b, #8
-       add     v6.4s, v6.4s, v22.4s
-       eor     v18.16b, v6.16b, v18.16b
-       ext     v19.16b, v19.16b, v19.16b, #12
-       tbl     v18.16b, { v18.16b }, v0.16b
-       add     v19.4s, v19.4s, v18.4s
-       eor     v22.16b, v22.16b, v19.16b
-       ushr    v26.4s, v22.4s, #12
-       shl     v22.4s, v22.4s, #20
-       add     v6.4s, v6.4s, v7.4s
-       orr     v22.16b, v22.16b, v26.16b
-       add     v6.4s, v6.4s, v22.4s
-       ext     v26.16b, v6.16b, v6.16b, #12
-       eor     v6.16b, v18.16b, v6.16b
-       uzp2    v4.4s, v4.4s, v7.4s
-       zip2    v25.4s, v7.4s, v16.4s
-       add     v26.4s, v26.4s, v21.4s
-       zip1    v20.2d, v16.2d, v7.2d
-       tbl     v6.16b, { v6.16b }, v1.16b
-       ext     v24.16b, v4.16b, v4.16b, #4
-       tbl     v27.16b, { v20.16b, v21.16b }, v2.16b
-       zip1    v7.4s, v25.4s, v21.4s
-       zip1    v20.4s, v21.4s, v25.4s
-       add     v18.4s, v19.4s, v6.4s
-       uzp1    v5.4s, v24.4s, v24.4s
-       ext     v21.16b, v27.16b, v27.16b, #12
-       ext     v7.16b, v20.16b, v7.16b, #8
-       eor     v19.16b, v22.16b, v18.16b
-       ext     v5.16b, v5.16b, v24.16b, #8
-       tbl     v17.16b, { v28.16b, v29.16b }, v3.16b
-       uzp1    v21.4s, v27.4s, v21.4s
-       mov     v28.16b, v7.16b
-       ushr    v22.4s, v19.4s, #7
-       shl     v19.4s, v19.4s, #25
-       ext     v23.16b, v24.16b, v24.16b, #12
-       uzp2    v5.4s, v5.4s, v17.4s
-       mov     v28.s[1], v21.s[2]
-       orr     v19.16b, v19.16b, v22.16b
-       ext     v27.16b, v24.16b, v23.16b, #12
-       ext     v23.16b, v5.16b, v5.16b, #4
-       ext     v6.16b, v6.16b, v6.16b, #8
-       ext     v25.16b, v18.16b, v18.16b, #4
-       add     v18.4s, v26.4s, v19.4s
-       uzp1    v24.4s, v23.4s, v23.4s
-       eor     v6.16b, v18.16b, v6.16b
-       ext     v24.16b, v24.16b, v23.16b, #8
-       add     v16.4s, v18.4s, v16.4s
-       tbl     v18.16b, { v27.16b, v28.16b }, v3.16b
-       tbl     v27.16b, { v6.16b }, v0.16b
-       uzp2    v6.4s, v24.4s, v18.4s
-       add     v24.4s, v25.4s, v27.4s
-       eor     v19.16b, v19.16b, v24.16b
-       ushr    v25.4s, v19.4s, #12
-       shl     v19.4s, v19.4s, #20
-       orr     v19.16b, v19.16b, v25.16b
-       add     v16.4s, v16.4s, v19.4s
-       eor     v25.16b, v27.16b, v16.16b
-       add     v4.4s, v16.4s, v4.4s
-       tbl     v16.16b, { v25.16b }, v1.16b
-       add     v24.4s, v24.4s, v16.4s
-       eor     v19.16b, v19.16b, v24.16b
-       ushr    v25.4s, v19.4s, #7
-       shl     v19.4s, v19.4s, #25
-       ext     v4.16b, v4.16b, v4.16b, #4
-       orr     v19.16b, v19.16b, v25.16b
-       ext     v16.16b, v16.16b, v16.16b, #8
-       add     v4.4s, v4.4s, v19.4s
-       eor     v16.16b, v4.16b, v16.16b
-       ext     v24.16b, v24.16b, v24.16b, #12
-       tbl     v25.16b, { v16.16b }, v0.16b
-       add     v24.4s, v24.4s, v25.4s
-       eor     v16.16b, v19.16b, v24.16b
-       ushr    v19.4s, v16.4s, #12
-       shl     v16.4s, v16.4s, #20
-       add     v4.4s, v4.4s, v17.4s
-       orr     v19.16b, v16.16b, v19.16b
-       add     v27.4s, v4.4s, v19.4s
-       eor     v25.16b, v25.16b, v27.16b
-       tbl     v25.16b, { v25.16b }, v1.16b
-       add     v24.4s, v24.4s, v25.4s
-       zip2    v26.4s, v17.4s, v7.4s
-       ext     v4.16b, v27.16b, v27.16b, #12
-       eor     v19.16b, v19.16b, v24.16b
-       add     v28.4s, v4.4s, v21.4s
-       zip1    v20.2d, v7.2d, v17.2d
-       zip1    v4.4s, v26.4s, v21.4s
-       zip1    v17.4s, v21.4s, v26.4s
-       ushr    v26.4s, v19.4s, #7
-       shl     v19.4s, v19.4s, #25
-       orr     v19.16b, v19.16b, v26.16b
-       ext     v25.16b, v25.16b, v25.16b, #8
-       add     v27.4s, v28.4s, v19.4s
-       eor     v25.16b, v27.16b, v25.16b
-       ext     v24.16b, v24.16b, v24.16b, #4
-       tbl     v25.16b, { v25.16b }, v0.16b
-       add     v24.4s, v24.4s, v25.4s
-       eor     v19.16b, v19.16b, v24.16b
-       add     v7.4s, v27.4s, v7.4s
-       ushr    v27.4s, v19.4s, #12
-       shl     v19.4s, v19.4s, #20
-       orr     v19.16b, v19.16b, v27.16b
-       add     v7.4s, v7.4s, v19.4s
-       eor     v25.16b, v25.16b, v7.16b
-       add     v5.4s, v7.4s, v5.4s
-       tbl     v7.16b, { v25.16b }, v1.16b
-       add     v24.4s, v24.4s, v7.4s
-       eor     v19.16b, v19.16b, v24.16b
-       ushr    v25.4s, v19.4s, #7
-       shl     v19.4s, v19.4s, #25
-       ext     v5.16b, v5.16b, v5.16b, #4
-       orr     v19.16b, v19.16b, v25.16b
-       ext     v7.16b, v7.16b, v7.16b, #8
-       add     v5.4s, v5.4s, v19.4s
-       eor     v7.16b, v5.16b, v7.16b
-       ext     v24.16b, v24.16b, v24.16b, #12
-       tbl     v7.16b, { v7.16b }, v0.16b
-       add     v24.4s, v24.4s, v7.4s
-       eor     v19.16b, v19.16b, v24.16b
-       ushr    v25.4s, v19.4s, #12
-       shl     v19.4s, v19.4s, #20
-       tbl     v16.16b, { v20.16b, v21.16b }, v2.16b
-       add     v5.4s, v5.4s, v18.4s
-       orr     v19.16b, v19.16b, v25.16b
-       ext     v20.16b, v16.16b, v16.16b, #12
-       ext     v4.16b, v17.16b, v4.16b, #8
-       add     v5.4s, v5.4s, v19.4s
-       uzp1    v21.4s, v16.4s, v20.4s
-       mov     v17.16b, v4.16b
-       ext     v25.16b, v5.16b, v5.16b, #12
-       mov     v17.s[1], v21.s[2]
-       add     v25.4s, v25.4s, v21.4s
-       zip1    v20.2d, v4.2d, v18.2d
-       ext     v22.16b, v23.16b, v23.16b, #12
-       zip2    v26.4s, v18.4s, v4.4s
-       tbl     v18.16b, { v20.16b, v21.16b }, v2.16b
-       eor     v5.16b, v7.16b, v5.16b
-       ext     v16.16b, v23.16b, v22.16b, #12
-       ext     v22.16b, v6.16b, v6.16b, #4
-       zip1    v27.4s, v26.4s, v21.4s
-       zip1    v20.4s, v21.4s, v26.4s
-       ext     v21.16b, v18.16b, v18.16b, #12
-       tbl     v5.16b, { v5.16b }, v1.16b
-       ext     v20.16b, v20.16b, v27.16b, #8
-       uzp1    v27.4s, v18.4s, v21.4s
-       uzp1    v18.4s, v22.4s, v22.4s
-       add     v21.4s, v24.4s, v5.4s
-       ext     v18.16b, v18.16b, v22.16b, #8
-       eor     v19.16b, v19.16b, v21.16b
-       tbl     v7.16b, { v16.16b, v17.16b }, v3.16b
-       uzp2    v18.4s, v18.4s, v17.4s
-       zip2    v16.4s, v16.4s, v20.4s
-       ushr    v17.4s, v19.4s, #7
-       shl     v19.4s, v19.4s, #25
-       orr     v17.16b, v19.16b, v17.16b
-       ext     v5.16b, v5.16b, v5.16b, #8
-       add     v19.4s, v25.4s, v17.4s
-       eor     v5.16b, v19.16b, v5.16b
-       ext     v21.16b, v21.16b, v21.16b, #4
-       tbl     v5.16b, { v5.16b }, v0.16b
-       add     v4.4s, v19.4s, v4.4s
-       add     v19.4s, v21.4s, v5.4s
-       eor     v17.16b, v17.16b, v19.16b
-       ushr    v21.4s, v17.4s, #12
-       shl     v17.4s, v17.4s, #20
-       orr     v17.16b, v17.16b, v21.16b
-       add     v4.4s, v4.4s, v17.4s
-       eor     v5.16b, v5.16b, v4.16b
-       tbl     v5.16b, { v5.16b }, v1.16b
-       add     v4.4s, v4.4s, v6.4s
-       add     v6.4s, v19.4s, v5.4s
-       eor     v17.16b, v17.16b, v6.16b
-       ushr    v19.4s, v17.4s, #7
-       shl     v17.4s, v17.4s, #25
-       ext     v4.16b, v4.16b, v4.16b, #4
-       orr     v17.16b, v17.16b, v19.16b
-       ext     v5.16b, v5.16b, v5.16b, #8
-       add     v4.4s, v4.4s, v17.4s
-       eor     v5.16b, v4.16b, v5.16b
-       ext     v6.16b, v6.16b, v6.16b, #12
-       tbl     v5.16b, { v5.16b }, v0.16b
-       add     v6.4s, v6.4s, v5.4s
-       eor     v17.16b, v17.16b, v6.16b
-       ushr    v19.4s, v17.4s, #12
-       shl     v17.4s, v17.4s, #20
-       add     v4.4s, v4.4s, v7.4s
-       orr     v17.16b, v17.16b, v19.16b
-       add     v4.4s, v4.4s, v17.4s
-       eor     v5.16b, v5.16b, v4.16b
-       tbl     v5.16b, { v5.16b }, v1.16b
-       mov     v29.16b, v20.16b
-       ext     v4.16b, v4.16b, v4.16b, #12
-       add     v6.4s, v6.4s, v5.4s
-       mov     v29.s[1], v27.s[2]
-       add     v4.4s, v4.4s, v27.4s
-       zip1    v26.2d, v20.2d, v7.2d
-       zip1    v7.4s, v16.4s, v27.4s
-       zip1    v16.4s, v27.4s, v16.4s
-       eor     v17.16b, v17.16b, v6.16b
-       ext     v7.16b, v16.16b, v7.16b, #8
-       ushr    v16.4s, v17.4s, #7
-       shl     v17.4s, v17.4s, #25
-       orr     v16.16b, v17.16b, v16.16b
-       ext     v5.16b, v5.16b, v5.16b, #8
-       add     v4.4s, v4.4s, v16.4s
-       eor     v5.16b, v4.16b, v5.16b
-       ext     v6.16b, v6.16b, v6.16b, #4
-       tbl     v5.16b, { v5.16b }, v0.16b
-       add     v6.4s, v6.4s, v5.4s
-       eor     v16.16b, v16.16b, v6.16b
-       ushr    v17.4s, v16.4s, #12
-       shl     v16.4s, v16.4s, #20
-       add     v4.4s, v4.4s, v20.4s
-       orr     v16.16b, v16.16b, v17.16b
-       add     v4.4s, v4.4s, v16.4s
-       eor     v5.16b, v5.16b, v4.16b
-       tbl     v5.16b, { v5.16b }, v1.16b
-       add     v6.4s, v6.4s, v5.4s
-       eor     v16.16b, v16.16b, v6.16b
-       add     v4.4s, v4.4s, v18.4s
-       ushr    v17.4s, v16.4s, #7
-       shl     v16.4s, v16.4s, #25
-       ext     v23.16b, v22.16b, v22.16b, #12
-       ext     v4.16b, v4.16b, v4.16b, #4
-       orr     v16.16b, v16.16b, v17.16b
-       ext     v28.16b, v22.16b, v23.16b, #12
-       ext     v5.16b, v5.16b, v5.16b, #8
-       add     v4.4s, v16.4s, v4.4s
-       tbl     v3.16b, { v28.16b, v29.16b }, v3.16b
-       eor     v5.16b, v4.16b, v5.16b
-       ext     v6.16b, v6.16b, v6.16b, #12
-       add     v3.4s, v4.4s, v3.4s
-       tbl     v4.16b, { v5.16b }, v0.16b
-       add     v5.4s, v6.4s, v4.4s
-       eor     v6.16b, v16.16b, v5.16b
-       ushr    v16.4s, v6.4s, #12
-       shl     v6.4s, v6.4s, #20
-       orr     v6.16b, v6.16b, v16.16b
-       tbl     v2.16b, { v26.16b, v27.16b }, v2.16b
-       add     v3.4s, v3.4s, v6.4s
-       ext     v19.16b, v2.16b, v2.16b, #12
-       eor     v4.16b, v4.16b, v3.16b
-       uzp1    v2.4s, v2.4s, v19.4s
-       ext     v3.16b, v3.16b, v3.16b, #12
-       tbl     v4.16b, { v4.16b }, v1.16b
-       add     v2.4s, v3.4s, v2.4s
-       add     v3.4s, v5.4s, v4.4s
-       eor     v5.16b, v6.16b, v3.16b
-       ushr    v6.4s, v5.4s, #7
-       shl     v5.4s, v5.4s, #25
-       orr     v5.16b, v5.16b, v6.16b
-       ext     v4.16b, v4.16b, v4.16b, #8
-       add     v2.4s, v2.4s, v5.4s
-       eor     v4.16b, v2.16b, v4.16b
-       ext     v3.16b, v3.16b, v3.16b, #4
-       tbl     v0.16b, { v4.16b }, v0.16b
-       add     v3.4s, v3.4s, v0.4s
-       eor     v4.16b, v5.16b, v3.16b
-       ushr    v5.4s, v4.4s, #12
-       shl     v4.4s, v4.4s, #20
-       add     v2.4s, v2.4s, v7.4s
-       orr     v4.16b, v4.16b, v5.16b
-       add     v2.4s, v2.4s, v4.4s
-       eor     v0.16b, v0.16b, v2.16b
-       tbl     v0.16b, { v0.16b }, v1.16b
-       add     v1.4s, v3.4s, v0.4s
-       eor     v3.16b, v4.16b, v1.16b
-       ushr    v4.4s, v3.4s, #7
-       shl     v3.4s, v3.4s, #25
-       ext     v2.16b, v2.16b, v2.16b, #4
-       ext     v0.16b, v0.16b, v0.16b, #8
-       ext     v1.16b, v1.16b, v1.16b, #12
-       orr     v3.16b, v3.16b, v4.16b
-       eor     v2.16b, v2.16b, v1.16b
-       eor     v3.16b, v3.16b, v0.16b
-       stp     q2, q3, [x5]
-       ldr     q2, [x0]
-       eor     v1.16b, v2.16b, v1.16b
-       str     q1, [x5, #32]
-       ldr     q1, [x0, #16]
-       eor     v0.16b, v1.16b, v0.16b
-       str     q0, [x5, #48]
-       ret
-.Lfunc_end1:
-       .size   zfs_blake3_compress_xof_sse41, .Lfunc_end1-zfs_blake3_compress_xof_sse41
-       .cfi_endproc
-
-       .section        .rodata.cst16,"aM",@progbits,16
-       .p2align        4
-.LCPI2_0:
-       .word   0
-       .word   1
-       .word   2
-       .word   3
-.LCPI2_1:
-       .byte   2
-       .byte   3
-       .byte   0
-       .byte   1
-       .byte   6
-       .byte   7
-       .byte   4
-       .byte   5
-       .byte   10
-       .byte   11
-       .byte   8
-       .byte   9
-       .byte   14
-       .byte   15
-       .byte   12
-       .byte   13
-.LCPI2_2:
-       .byte   1
-       .byte   2
-       .byte   3
-       .byte   0
-       .byte   5
-       .byte   6
-       .byte   7
-       .byte   4
-       .byte   9
-       .byte   10
-       .byte   11
-       .byte   8
-       .byte   13
-       .byte   14
-       .byte   15
-       .byte   12
-       .text
-       .globl  zfs_blake3_hash_many_sse41
-       .p2align        2
-       .type   zfs_blake3_hash_many_sse41,@function
-zfs_blake3_hash_many_sse41:
-       .cfi_startproc
-       stp     d15, d14, [sp, #-160]!
+       hint    #34
+       stp     d15, d14, [sp, #-144]!
        stp     d13, d12, [sp, #16]
        stp     d11, d10, [sp, #32]
        stp     d9, d8, [sp, #48]
-       stp     x29, x30, [sp, #64]
-       stp     x28, x27, [sp, #80]
-       stp     x26, x25, [sp, #96]
-       stp     x24, x23, [sp, #112]
-       stp     x22, x21, [sp, #128]
-       stp     x20, x19, [sp, #144]
-       mov     x29, sp
-       sub     sp, sp, #448
-       .cfi_def_cfa w29, 160
+       stp     x29, x27, [sp, #64]
+       stp     x26, x25, [sp, #80]
+       stp     x24, x23, [sp, #96]
+       stp     x22, x21, [sp, #112]
+       stp     x20, x19, [sp, #128]
+       sub     sp, sp, #368
+       .cfi_def_cfa_offset 512
        .cfi_offset w19, -8
        .cfi_offset w20, -16
        .cfi_offset w21, -24
@@ -1123,1341 +677,1722 @@ zfs_blake3_hash_many_sse41:
        .cfi_offset w25, -56
        .cfi_offset w26, -64
        .cfi_offset w27, -72
-       .cfi_offset w28, -80
-       .cfi_offset w30, -88
-       .cfi_offset w29, -96
-       .cfi_offset b8, -104
-       .cfi_offset b9, -112
-       .cfi_offset b10, -120
-       .cfi_offset b11, -128
-       .cfi_offset b12, -136
-       .cfi_offset b13, -144
-       .cfi_offset b14, -152
-       .cfi_offset b15, -160
-       ldr     x26, [x29, #168]
-       ldrb    w27, [x29, #160]
-       mov     w19, w6
-       mov     x20, x4
-       mov     x22, x2
-       mov     x28, x1
+       .cfi_offset w29, -80
+       .cfi_offset b8, -88
+       .cfi_offset b9, -96
+       .cfi_offset b10, -104
+       .cfi_offset b11, -112
+       .cfi_offset b12, -120
+       .cfi_offset b13, -128
+       .cfi_offset b14, -136
+       .cfi_offset b15, -144
+       ldr     x8, [sp, #520]
+       adrp    x11, .LCPI3_1
+       ldrb    w9, [sp, #512]
+       adrp    x10, .LCPI3_2
        cmp     x1, #4
-       mov     x24, x0
-       str     x3, [sp, #40]
-       b.lo    .LBB2_8
-       adrp    x11, .LCPI2_0
-       ldr     q0, [x11, :lo12:.LCPI2_0]
+       b.lo    .LBB3_6
+       adrp    x12, .LCPI3_0
        sbfx    w13, w5, #0, #1
+       mov     w15, #58983
+       mov     w16, #44677
+       movk    w15, #27145, lsl #16
+       movk    w16, #47975, lsl #16
+       ldr     q0, [x12, :lo12:.LCPI3_0]
        dup     v1.4s, w13
-       mov     w10, #58983
-       mov     w11, #44677
-       mov     w12, #62322
+       movi    v13.4s, #64
+       mov     w13, #62322
+       mov     w14, #62778
+       orr     w12, w7, w6
        and     v0.16b, v1.16b, v0.16b
-       mov     w13, #62778
-       orr     w8, w7, w19
-       adrp    x9, .LCPI2_1
-       movk    w10, #27145, lsl #16
-       movk    w11, #47975, lsl #16
-       movk    w12, #15470, lsl #16
-       movk    w13, #42319, lsl #16
-       str     q0, [sp, #16]
+       ldr     q1, [x11, :lo12:.LCPI3_1]
+       movk    w13, #15470, lsl #16
+       movk    w14, #42319, lsl #16
+       dup     v14.4s, w15
+       stp     q0, q1, [sp, #16]
        orr     v0.4s, #128, lsl #24
-       adrp    x14, .LCPI2_2
        str     q0, [sp]
-.LBB2_2:
-       ldr     x2, [sp, #40]
-       mov     x15, x2
-       ld1r    { v7.4s }, [x15], #4
-       add     x16, x2, #8
-       add     x17, x2, #12
-       add     x18, x2, #16
-       add     x0, x2, #20
-       add     x3, x2, #24
-       add     x2, x2, #28
-       ld1r    { v6.4s }, [x16]
-       ld1r    { v17.4s }, [x17]
-       ld1r    { v10.4s }, [x18]
-       ld1r    { v11.4s }, [x0]
-       ld1r    { v19.4s }, [x3]
-       ld1r    { v18.4s }, [x15]
-       ld1r    { v16.4s }, [x2]
-       cbz     x22, .LBB2_7
+       dup     v0.4s, w16
+       stp     q0, q14, [sp, #48]
+       b       .LBB3_3
+.LBB3_2:
+       zip1    v0.4s, v29.4s, v8.4s
+       add     x15, x4, #4
+       zip1    v1.4s, v30.4s, v31.4s
+       tst     w5, #0x1
+       zip1    v2.4s, v24.4s, v18.4s
+       csel    x4, x15, x4, ne
+       zip1    v3.4s, v25.4s, v26.4s
+       add     x0, x0, #32
+       zip2    v6.4s, v29.4s, v8.4s
+       sub     x1, x1, #4
+       zip1    v4.2d, v0.2d, v1.2d
+       cmp     x1, #3
+       zip2    v7.4s, v30.4s, v31.4s
+       zip1    v5.2d, v2.2d, v3.2d
+       zip2    v0.2d, v0.2d, v1.2d
+       zip2    v1.2d, v2.2d, v3.2d
+       zip2    v2.4s, v24.4s, v18.4s
+       zip2    v3.4s, v25.4s, v26.4s
+       stp     q4, q5, [x8]
+       zip2    v4.2d, v6.2d, v7.2d
+       stp     q0, q1, [x8, #32]
+       zip1    v0.2d, v6.2d, v7.2d
+       zip1    v1.2d, v2.2d, v3.2d
+       zip2    v2.2d, v2.2d, v3.2d
+       stp     q0, q1, [x8, #64]
+       stp     q4, q2, [x8, #96]
+       add     x8, x8, #128
+       b.ls    .LBB3_6
+.LBB3_3:
+       mov     x15, x3
+       add     x16, x3, #8
+       add     x17, x3, #12
+       add     x19, x3, #16
+       add     x20, x3, #20
+       ld1r    { v29.4s }, [x15], #4
+       ld1r    { v30.4s }, [x16]
+       add     x16, x3, #24
+       ld1r    { v31.4s }, [x17]
+       add     x17, x3, #28
+       ld1r    { v24.4s }, [x19]
+       ld1r    { v18.4s }, [x20]
+       ld1r    { v25.4s }, [x16]
+       ld1r    { v8.4s }, [x15]
+       ld1r    { v26.4s }, [x17]
+       cbz     x2, .LBB3_2
        ldr     q1, [sp, #16]
-       dup     v0.4s, w20
-       ldp     x15, x16, [x24]
-       ldp     x17, x18, [x24, #16]
+       dup     v0.4s, w4
+       lsr     x17, x4, #32
+       mov     x15, xzr
+       ldp     x19, x20, [x0, #16]
        add     v1.4s, v0.4s, v1.4s
+       mov     x21, x2
        movi    v0.4s, #128, lsl #24
-       str     q1, [sp, #64]
+       mov     w26, w12
+       str     q1, [sp, #96]
        eor     v0.16b, v1.16b, v0.16b
        ldr     q1, [sp]
-       lsr     x2, x20, #32
-       mov     x0, xzr
-       mov     w6, w8
        cmgt    v0.4s, v1.4s, v0.4s
-       dup     v1.4s, w2
+       dup     v1.4s, w17
+       ldp     x16, x17, [x0]
        sub     v0.4s, v1.4s, v0.4s
-       str     q0, [sp, #48]
-.LBB2_4:
-       mov     w4, #16
-       stp     q16, q17, [sp, #192]
-       bfi     x4, x0, #6, #58
-       ldr     q1, [x15, x4]
-       ldr     q3, [x16, x4]
-       ldr     q2, [x17, x4]
-       ldr     q4, [x18, x4]
-       mov     w4, #32
-       bfi     x4, x0, #6, #58
-       ldr     q5, [x15, x4]
-       ldr     q20, [x16, x4]
-       ldr     q21, [x17, x4]
-       ldr     q22, [x18, x4]
-       mov     w4, #48
-       lsl     x3, x0, #6
-       bfi     x4, x0, #6, #58
-       add     x0, x0, #1
-       ldr     q0, [x15, x3]
-       ldr     q23, [x16, x3]
-       ldr     q16, [x17, x3]
-       ldr     q17, [x18, x3]
-       cmp     x0, x22
-       ldr     q25, [x15, x4]
-       ldr     q14, [x16, x4]
-       ldr     q28, [x17, x4]
-       ldr     q31, [x18, x4]
-       csel    w4, w27, wzr, eq
-       orr     w4, w4, w6
-       mov     x2, xzr
-       and     w6, w4, #0xff
-       add     x3, x3, #256
-.LBB2_5:
-       ldr     x4, [x24, x2]
-       add     x2, x2, #8
-       cmp     x2, #32
-       add     x4, x4, x3
-       prfm    pldl1keep, [x4]
-       b.ne    .LBB2_5
-       zip1    v29.4s, v0.4s, v23.4s
-       zip2    v23.4s, v0.4s, v23.4s
-       zip1    v0.4s, v16.4s, v17.4s
-       zip2    v24.4s, v16.4s, v17.4s
-       zip1    v9.4s, v1.4s, v3.4s
-       zip2    v26.4s, v1.4s, v3.4s
-       zip1    v27.4s, v2.4s, v4.4s
-       zip2    v17.4s, v2.4s, v4.4s
-       zip1    v12.4s, v21.4s, v22.4s
-       zip2    v13.4s, v21.4s, v22.4s
-       add     v2.4s, v7.4s, v10.4s
-       add     v1.4s, v18.4s, v11.4s
-       ext     v7.16b, v0.16b, v29.16b, #8
-       ext     v22.16b, v24.16b, v23.16b, #8
-       zip1    v30.4s, v5.4s, v20.4s
-       zip2    v20.4s, v5.4s, v20.4s
-       stp     q1, q2, [sp, #112]
-       ext     v2.16b, v29.16b, v7.16b, #8
-       mov     v29.d[1], v0.d[0]
-       ext     v18.16b, v23.16b, v22.16b, #8
-       mov     v23.d[1], v24.d[0]
-       zip1    v21.4s, v25.4s, v14.4s
-       zip2    v4.4s, v25.4s, v14.4s
-       zip1    v14.4s, v28.4s, v31.4s
-       zip2    v15.4s, v28.4s, v31.4s
-       add     v8.4s, v6.4s, v19.4s
-       ext     v28.16b, v27.16b, v9.16b, #8
-       ext     v31.16b, v17.16b, v26.16b, #8
-       stur    q2, [x29, #-208]
-       mov     v7.16b, v29.16b
-       ext     v0.16b, v12.16b, v30.16b, #8
-       stp     q23, q29, [x29, #-80]
-       mov     v2.16b, v19.16b
-       ext     v19.16b, v13.16b, v20.16b, #8
-       mov     v29.16b, v9.16b
-       ext     v25.16b, v9.16b, v28.16b, #8
-       mov     v29.d[1], v27.d[0]
-       ext     v24.16b, v26.16b, v31.16b, #8
-       mov     v26.d[1], v17.d[0]
-       ext     v17.16b, v15.16b, v4.16b, #8
-       ext     v27.16b, v30.16b, v0.16b, #8
-       ext     v0.16b, v20.16b, v19.16b, #8
-       stp     q0, q25, [sp, #80]
-       ext     v0.16b, v4.16b, v17.16b, #8
-       str     q0, [sp, #224]
-       ldr     q0, [sp, #128]
-       mov     v6.16b, v23.16b
-       mov     v22.16b, v4.16b
-       ldr     q16, [x9, :lo12:.LCPI2_1]
-       add     v17.4s, v0.4s, v7.4s
-       ldr     q0, [sp, #112]
-       mov     v30.d[1], v12.d[0]
-       add     v7.4s, v8.4s, v29.4s
-       mov     v20.d[1], v13.d[0]
-       add     v4.4s, v0.4s, v6.4s
-       ldr     q0, [sp, #64]
-       dup     v3.4s, w12
-       ext     v28.16b, v14.16b, v21.16b, #8
-       dup     v1.4s, w10
-       eor     v19.16b, v17.16b, v0.16b
-       ldr     q0, [sp, #48]
-       ext     v23.16b, v21.16b, v28.16b, #8
-       mov     v21.d[1], v14.d[0]
-       tbl     v14.16b, { v19.16b }, v16.16b
-       eor     v12.16b, v4.16b, v0.16b
-       movi    v0.4s, #64
-       eor     v13.16b, v7.16b, v0.16b
-       tbl     v13.16b, { v13.16b }, v16.16b
-       add     v6.4s, v13.4s, v3.4s
-       dup     v5.4s, w11
-       tbl     v12.16b, { v12.16b }, v16.16b
-       add     v1.4s, v14.4s, v1.4s
-       eor     v9.16b, v6.16b, v2.16b
-       ldp     q2, q0, [sp, #192]
-       add     v5.4s, v12.4s, v5.4s
-       eor     v19.16b, v1.16b, v10.16b
-       eor     v10.16b, v5.16b, v11.16b
-       ushr    v11.4s, v19.4s, #12
-       shl     v19.4s, v19.4s, #20
-       orr     v11.16b, v19.16b, v11.16b
-       ushr    v19.4s, v10.4s, #12
-       shl     v10.4s, v10.4s, #20
-       mov     v22.d[1], v15.d[0]
-       orr     v10.16b, v10.16b, v19.16b
-       ushr    v19.4s, v9.4s, #12
-       shl     v9.4s, v9.4s, #20
-       add     v15.4s, v0.4s, v2.4s
-       orr     v9.16b, v9.16b, v19.16b
-       dup     v19.4s, w6
-       add     v15.4s, v15.4s, v26.4s
-       eor     v19.16b, v15.16b, v19.16b
-       tbl     v3.16b, { v19.16b }, v16.16b
-       dup     v19.4s, w13
-       add     v8.4s, v3.4s, v19.4s
-       ldur    q31, [x29, #-208]
-       eor     v19.16b, v8.16b, v2.16b
-       ushr    v0.4s, v19.4s, #12
-       shl     v19.4s, v19.4s, #20
-       orr     v2.16b, v19.16b, v0.16b
-       ldr     q19, [x14, :lo12:.LCPI2_2]
-       add     v17.4s, v17.4s, v31.4s
-       add     v17.4s, v17.4s, v11.4s
-       eor     v14.16b, v14.16b, v17.16b
-       tbl     v14.16b, { v14.16b }, v19.16b
-       add     v1.4s, v1.4s, v14.4s
-       eor     v11.16b, v1.16b, v11.16b
-       add     v4.4s, v4.4s, v18.4s
-       ushr    v0.4s, v11.4s, #7
-       shl     v11.4s, v11.4s, #25
-       add     v4.4s, v4.4s, v10.4s
-       orr     v0.16b, v11.16b, v0.16b
-       eor     v11.16b, v12.16b, v4.16b
-       tbl     v11.16b, { v11.16b }, v19.16b
-       add     v5.4s, v5.4s, v11.4s
-       eor     v10.16b, v5.16b, v10.16b
-       add     v7.4s, v7.4s, v25.4s
-       ushr    v12.4s, v10.4s, #7
-       shl     v10.4s, v10.4s, #25
-       add     v7.4s, v7.4s, v9.4s
-       orr     v10.16b, v10.16b, v12.16b
-       eor     v12.16b, v13.16b, v7.16b
-       tbl     v12.16b, { v12.16b }, v19.16b
-       add     v6.4s, v6.4s, v12.4s
-       eor     v9.16b, v6.16b, v9.16b
-       ushr    v13.4s, v9.4s, #7
-       shl     v9.4s, v9.4s, #25
-       orr     v9.16b, v9.16b, v13.16b
-       add     v13.4s, v15.4s, v24.4s
-       add     v13.4s, v13.4s, v2.4s
-       eor     v3.16b, v3.16b, v13.16b
-       tbl     v3.16b, { v3.16b }, v19.16b
-       add     v8.4s, v8.4s, v3.4s
-       eor     v2.16b, v8.16b, v2.16b
-       add     v17.4s, v17.4s, v30.4s
-       ushr    v15.4s, v2.4s, #7
-       shl     v2.4s, v2.4s, #25
-       add     v17.4s, v17.4s, v10.4s
-       add     v4.4s, v4.4s, v20.4s
-       orr     v2.16b, v2.16b, v15.16b
-       eor     v3.16b, v3.16b, v17.16b
-       add     v4.4s, v4.4s, v9.4s
-       add     v7.4s, v7.4s, v21.4s
-       tbl     v3.16b, { v3.16b }, v16.16b
-       eor     v14.16b, v14.16b, v4.16b
-       add     v7.4s, v7.4s, v2.4s
-       add     v13.4s, v13.4s, v22.4s
-       mov     v28.16b, v26.16b
-       stur    q26, [x29, #-112]
-       mov     v26.16b, v18.16b
-       mov     v18.16b, v24.16b
-       stur    q24, [x29, #-160]
-       add     v6.4s, v6.4s, v3.4s
-       mov     v24.16b, v20.16b
-       tbl     v14.16b, { v14.16b }, v16.16b
-       eor     v11.16b, v11.16b, v7.16b
-       add     v13.4s, v13.4s, v0.4s
-       ldr     q20, [sp, #80]
-       eor     v10.16b, v6.16b, v10.16b
-       add     v8.4s, v8.4s, v14.4s
-       tbl     v11.16b, { v11.16b }, v16.16b
-       eor     v12.16b, v12.16b, v13.16b
-       stp     q30, q22, [x29, #-192]
-       ushr    v15.4s, v10.4s, #12
-       shl     v10.4s, v10.4s, #20
-       eor     v9.16b, v8.16b, v9.16b
-       add     v1.4s, v1.4s, v11.4s
-       tbl     v12.16b, { v12.16b }, v16.16b
-       mov     v30.16b, v27.16b
-       add     v17.4s, v17.4s, v27.4s
-       ldr     q27, [sp, #224]
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v9.4s, #12
-       shl     v9.4s, v9.4s, #20
-       eor     v2.16b, v1.16b, v2.16b
-       add     v5.4s, v5.4s, v12.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v2.4s, #12
-       shl     v2.4s, v2.4s, #20
-       eor     v0.16b, v5.16b, v0.16b
-       add     v17.4s, v17.4s, v10.4s
-       add     v4.4s, v4.4s, v20.4s
-       orr     v2.16b, v2.16b, v15.16b
-       ushr    v15.4s, v0.4s, #12
-       shl     v0.4s, v0.4s, #20
-       eor     v3.16b, v3.16b, v17.16b
-       add     v4.4s, v4.4s, v9.4s
-       add     v7.4s, v7.4s, v23.4s
-       orr     v0.16b, v0.16b, v15.16b
-       tbl     v3.16b, { v3.16b }, v19.16b
-       eor     v14.16b, v14.16b, v4.16b
-       add     v7.4s, v7.4s, v2.4s
-       add     v13.4s, v13.4s, v27.4s
-       add     v6.4s, v6.4s, v3.4s
-       tbl     v14.16b, { v14.16b }, v19.16b
-       eor     v11.16b, v11.16b, v7.16b
-       add     v13.4s, v13.4s, v0.4s
-       eor     v10.16b, v6.16b, v10.16b
-       add     v8.4s, v8.4s, v14.4s
-       tbl     v11.16b, { v11.16b }, v19.16b
-       eor     v12.16b, v12.16b, v13.16b
-       stur    q21, [x29, #-144]
-       ushr    v15.4s, v10.4s, #7
-       shl     v10.4s, v10.4s, #25
-       eor     v9.16b, v8.16b, v9.16b
-       add     v1.4s, v1.4s, v11.4s
-       tbl     v12.16b, { v12.16b }, v19.16b
-       ldur    q21, [x29, #-80]
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v9.4s, #7
-       shl     v9.4s, v9.4s, #25
-       eor     v2.16b, v1.16b, v2.16b
-       add     v5.4s, v5.4s, v12.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v2.4s, #7
-       shl     v2.4s, v2.4s, #25
+       str     q0, [sp, #80]
+.LBB3_5:
+       add     x23, x16, x15
+       add     x24, x17, x15
+       add     x22, x19, x15
+       add     x25, x20, x15
+       subs    x21, x21, #1
+       add     x15, x15, #64
+       ldp     q1, q2, [x23]
+       csel    w27, w9, wzr, eq
+       orr     w26, w27, w26
+       and     w26, w26, #0xff
+       ldp     q4, q5, [x24]
+       dup     v0.4s, w26
+       mov     w26, w6
+       zip1    v22.4s, v1.4s, v4.4s
+       zip2    v20.4s, v1.4s, v4.4s
+       ldp     q6, q7, [x22]
+       zip1    v17.4s, v2.4s, v5.4s
+       zip2    v23.4s, v2.4s, v5.4s
+       ldp     q16, q21, [x25]
+       zip1    v19.4s, v6.4s, v16.4s
+       zip2    v1.4s, v6.4s, v16.4s
+       ldp     q27, q28, [x23, #32]
+       zip1    v4.4s, v7.4s, v21.4s
+       zip2    v5.4s, v7.4s, v21.4s
+       zip2    v15.2d, v17.2d, v4.2d
+       ldp     q9, q10, [x24, #32]
+       mov     v17.d[1], v4.d[0]
+       add     v4.4s, v30.4s, v25.4s
+       zip2    v11.2d, v23.2d, v5.2d
+       zip2    v3.4s, v27.4s, v9.4s
+       zip1    v7.4s, v27.4s, v9.4s
+       ldp     q12, q6, [x22, #32]
+       mov     v23.d[1], v5.d[0]
+       stp     q11, q3, [sp, #256]
+       add     v5.4s, v31.4s, v26.4s
+       add     v4.4s, v4.4s, v17.4s
+       str     q23, [sp, #352]
+       ldp     q16, q2, [x25, #32]
+       add     v5.4s, v5.4s, v23.4s
+       zip1    v3.4s, v12.4s, v16.4s
        eor     v0.16b, v5.16b, v0.16b
-       orr     v2.16b, v2.16b, v15.16b
-       ushr    v15.4s, v0.4s, #7
-       shl     v0.4s, v0.4s, #25
-       orr     v0.16b, v0.16b, v15.16b
-       add     v17.4s, v17.4s, v21.4s
-       add     v17.4s, v17.4s, v0.4s
-       add     v4.4s, v4.4s, v26.4s
-       eor     v14.16b, v14.16b, v17.16b
-       add     v4.4s, v4.4s, v10.4s
-       add     v7.4s, v7.4s, v18.4s
-       tbl     v14.16b, { v14.16b }, v16.16b
-       eor     v11.16b, v11.16b, v4.16b
-       add     v7.4s, v7.4s, v9.4s
-       add     v13.4s, v13.4s, v29.4s
-       add     v1.4s, v1.4s, v14.4s
-       tbl     v11.16b, { v11.16b }, v16.16b
-       eor     v12.16b, v12.16b, v7.16b
-       add     v13.4s, v13.4s, v2.4s
-       eor     v0.16b, v0.16b, v1.16b
+       zip1    v9.4s, v6.4s, v2.4s
+       zip2    v2.4s, v6.4s, v2.4s
+       stp     q7, q3, [sp, #208]
+       zip2    v3.4s, v12.4s, v16.4s
+       zip1    v12.4s, v28.4s, v10.4s
+       zip2    v10.4s, v28.4s, v10.4s
+       stp     q17, q2, [sp, #160]
+       zip2    v28.2d, v22.2d, v19.2d
+       mov     v22.d[1], v19.d[0]
+       str     q3, [sp, #240]
+       add     v2.4s, v8.4s, v18.4s
+       eor     v16.16b, v4.16b, v13.16b
+       dup     v17.4s, w13
+       mov     v3.16b, v22.16b
+       stp     q22, q28, [sp, #320]
+       zip2    v22.2d, v20.2d, v1.2d
+       mov     v20.d[1], v1.d[0]
+       add     v1.4s, v29.4s, v24.4s
+       add     v4.4s, v4.4s, v15.4s
        add     v5.4s, v5.4s, v11.4s
-       tbl     v12.16b, { v12.16b }, v16.16b
-       eor     v3.16b, v3.16b, v13.16b
-       ldur    q22, [x29, #-64]
-       ushr    v15.4s, v0.4s, #12
+       add     v2.4s, v2.4s, v20.4s
+       stp     q15, q20, [sp, #288]
+       add     v1.4s, v1.4s, v3.4s
+       ldr     q3, [sp, #96]
+       dup     v20.4s, w14
+       mov     v23.16b, v22.16b
+       mov     v15.16b, v10.16b
+       eor     v6.16b, v1.16b, v3.16b
+       ldr     q3, [sp, #80]
+       add     v1.4s, v1.4s, v28.4s
+       ldr     q28, [sp, #272]
+       str     q23, [sp, #128]
+       eor     v7.16b, v2.16b, v3.16b
+       ldp     q27, q3, [sp, #32]
+       add     v2.4s, v2.4s, v22.4s
+       tbl     v6.16b, { v6.16b }, v27.16b
+       tbl     v7.16b, { v7.16b }, v27.16b
+       tbl     v16.16b, { v16.16b }, v27.16b
+       tbl     v0.16b, { v0.16b }, v27.16b
+       add     v19.4s, v6.4s, v14.4s
+       add     v21.4s, v7.4s, v3.4s
+       add     v30.4s, v16.4s, v17.4s
+       add     v31.4s, v0.4s, v20.4s
+       eor     v24.16b, v19.16b, v24.16b
+       eor     v17.16b, v21.16b, v18.16b
+       ushr    v18.4s, v24.4s, #12
+       shl     v20.4s, v24.4s, #20
+       eor     v24.16b, v30.16b, v25.16b
+       eor     v25.16b, v31.16b, v26.16b
+       ushr    v26.4s, v17.4s, #12
+       shl     v17.4s, v17.4s, #20
+       ushr    v29.4s, v24.4s, #12
+       shl     v24.4s, v24.4s, #20
+       ushr    v8.4s, v25.4s, #12
+       shl     v25.4s, v25.4s, #20
+       orr     v3.16b, v20.16b, v18.16b
+       ldr     q18, [x10, :lo12:.LCPI3_2]
+       orr     v13.16b, v17.16b, v26.16b
+       orr     v24.16b, v24.16b, v29.16b
+       orr     v14.16b, v25.16b, v8.16b
+       add     v8.4s, v1.4s, v3.4s
+       add     v29.4s, v2.4s, v13.4s
+       add     v17.4s, v4.4s, v24.4s
+       add     v20.4s, v5.4s, v14.4s
+       eor     v1.16b, v6.16b, v8.16b
+       eor     v2.16b, v7.16b, v29.16b
+       eor     v4.16b, v16.16b, v17.16b
+       eor     v0.16b, v0.16b, v20.16b
+       tbl     v25.16b, { v1.16b }, v18.16b
+       tbl     v16.16b, { v2.16b }, v18.16b
+       tbl     v6.16b, { v4.16b }, v18.16b
+       tbl     v4.16b, { v0.16b }, v18.16b
+       add     v19.4s, v19.4s, v25.4s
+       add     v21.4s, v21.4s, v16.4s
+       add     v26.4s, v30.4s, v6.4s
+       add     v7.4s, v31.4s, v4.4s
+       eor     v0.16b, v19.16b, v3.16b
+       eor     v1.16b, v21.16b, v13.16b
+       eor     v2.16b, v26.16b, v24.16b
+       eor     v3.16b, v7.16b, v14.16b
+       ushr    v5.4s, v0.4s, #7
+       shl     v0.4s, v0.4s, #25
+       ushr    v24.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       ushr    v30.4s, v2.4s, #7
+       shl     v2.4s, v2.4s, #25
+       orr     v5.16b, v0.16b, v5.16b
+       orr     v0.16b, v1.16b, v24.16b
+       ushr    v31.4s, v3.4s, #7
+       orr     v2.16b, v2.16b, v30.16b
+       ldp     q24, q30, [sp, #208]
+       shl     v3.4s, v3.4s, #25
+       zip2    v14.2d, v12.2d, v9.2d
+       mov     v22.16b, v24.16b
+       orr     v1.16b, v3.16b, v31.16b
+       zip2    v3.2d, v24.2d, v30.2d
+       mov     v24.16b, v28.16b
+       mov     v22.d[1], v30.d[0]
+       ldr     q30, [sp, #240]
+       mov     v31.16b, v12.16b
+       stp     q22, q14, [sp, #224]
+       mov     v24.d[1], v30.d[0]
+       add     v12.4s, v8.4s, v22.4s
+       mov     v31.d[1], v9.d[0]
+       add     v22.4s, v29.4s, v24.4s
+       ldr     q29, [sp, #176]
+       zip2    v28.2d, v28.2d, v30.2d
+       mov     v9.16b, v24.16b
+       mov     v15.d[1], v29.d[0]
+       zip2    v8.2d, v10.2d, v29.2d
+       add     v10.4s, v12.4s, v0.4s
+       add     v22.4s, v22.4s, v2.4s
+       str     q9, [sp, #144]
+       add     v20.4s, v20.4s, v15.4s
+       add     v17.4s, v17.4s, v31.4s
+       stp     q3, q8, [sp, #192]
+       eor     v4.16b, v4.16b, v10.16b
+       eor     v25.16b, v25.16b, v22.16b
+       add     v20.4s, v20.4s, v5.4s
+       add     v17.4s, v17.4s, v1.4s
+       tbl     v4.16b, { v4.16b }, v27.16b
+       tbl     v25.16b, { v25.16b }, v27.16b
+       eor     v6.16b, v6.16b, v20.16b
+       eor     v16.16b, v16.16b, v17.16b
+       add     v26.4s, v26.4s, v4.4s
+       add     v7.4s, v7.4s, v25.4s
+       tbl     v6.16b, { v6.16b }, v27.16b
+       tbl     v16.16b, { v16.16b }, v27.16b
+       eor     v0.16b, v26.16b, v0.16b
+       eor     v2.16b, v7.16b, v2.16b
+       add     v21.4s, v21.4s, v6.4s
+       add     v19.4s, v19.4s, v16.4s
+       ushr    v12.4s, v0.4s, #12
        shl     v0.4s, v0.4s, #20
-       eor     v10.16b, v5.16b, v10.16b
-       add     v6.4s, v6.4s, v12.4s
-       tbl     v3.16b, { v3.16b }, v16.16b
-       orr     v0.16b, v0.16b, v15.16b
-       ushr    v15.4s, v10.4s, #12
-       shl     v10.4s, v10.4s, #20
-       eor     v9.16b, v6.16b, v9.16b
-       add     v8.4s, v8.4s, v3.4s
-       add     v17.4s, v17.4s, v28.4s
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v9.4s, #12
-       shl     v9.4s, v9.4s, #20
-       eor     v2.16b, v8.16b, v2.16b
-       add     v17.4s, v17.4s, v0.4s
-       add     v4.4s, v4.4s, v24.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v2.4s, #12
+       ushr    v13.4s, v2.4s, #12
        shl     v2.4s, v2.4s, #20
-       eor     v14.16b, v14.16b, v17.16b
-       add     v4.4s, v4.4s, v10.4s
-       add     v7.4s, v7.4s, v22.4s
-       orr     v2.16b, v2.16b, v15.16b
-       tbl     v14.16b, { v14.16b }, v19.16b
-       eor     v11.16b, v11.16b, v4.16b
-       add     v7.4s, v7.4s, v9.4s
-       add     v13.4s, v13.4s, v23.4s
-       add     v1.4s, v1.4s, v14.4s
-       tbl     v11.16b, { v11.16b }, v19.16b
-       eor     v12.16b, v12.16b, v7.16b
-       add     v13.4s, v13.4s, v2.4s
-       eor     v0.16b, v0.16b, v1.16b
-       add     v5.4s, v5.4s, v11.4s
-       tbl     v12.16b, { v12.16b }, v19.16b
-       eor     v3.16b, v3.16b, v13.16b
-       ldur    q22, [x29, #-144]
-       ushr    v15.4s, v0.4s, #7
+       eor     v5.16b, v21.16b, v5.16b
+       eor     v1.16b, v19.16b, v1.16b
+       orr     v0.16b, v0.16b, v12.16b
+       add     v10.4s, v10.4s, v3.4s
+       orr     v2.16b, v2.16b, v13.16b
+       ushr    v13.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       add     v22.4s, v22.4s, v28.4s
+       ushr    v12.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
+       add     v10.4s, v10.4s, v0.4s
+       orr     v5.16b, v5.16b, v13.16b
+       add     v22.4s, v22.4s, v2.4s
+       add     v20.4s, v20.4s, v8.4s
+       orr     v1.16b, v1.16b, v12.16b
+       add     v17.4s, v17.4s, v14.4s
+       eor     v4.16b, v4.16b, v10.16b
+       eor     v25.16b, v25.16b, v22.16b
+       add     v20.4s, v20.4s, v5.4s
+       add     v17.4s, v17.4s, v1.4s
+       tbl     v4.16b, { v4.16b }, v18.16b
+       tbl     v25.16b, { v25.16b }, v18.16b
+       eor     v6.16b, v6.16b, v20.16b
+       eor     v16.16b, v16.16b, v17.16b
+       add     v26.4s, v26.4s, v4.4s
+       add     v7.4s, v7.4s, v25.4s
+       tbl     v6.16b, { v6.16b }, v18.16b
+       tbl     v16.16b, { v16.16b }, v18.16b
+       eor     v0.16b, v26.16b, v0.16b
+       eor     v2.16b, v7.16b, v2.16b
+       add     v21.4s, v21.4s, v6.4s
+       add     v19.4s, v19.4s, v16.4s
+       ushr    v12.4s, v0.4s, #7
        shl     v0.4s, v0.4s, #25
-       eor     v10.16b, v5.16b, v10.16b
-       add     v6.4s, v6.4s, v12.4s
-       tbl     v3.16b, { v3.16b }, v19.16b
-       orr     v0.16b, v0.16b, v15.16b
-       ushr    v15.4s, v10.4s, #7
-       shl     v10.4s, v10.4s, #25
-       eor     v9.16b, v6.16b, v9.16b
-       add     v8.4s, v8.4s, v3.4s
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v9.4s, #7
-       shl     v9.4s, v9.4s, #25
-       eor     v2.16b, v8.16b, v2.16b
-       add     v17.4s, v17.4s, v31.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v2.4s, #7
+       ushr    v13.4s, v2.4s, #7
        shl     v2.4s, v2.4s, #25
-       add     v17.4s, v17.4s, v10.4s
-       add     v4.4s, v4.4s, v22.4s
-       orr     v2.16b, v2.16b, v15.16b
-       eor     v3.16b, v3.16b, v17.16b
-       add     v4.4s, v4.4s, v9.4s
-       add     v7.4s, v7.4s, v30.4s
-       tbl     v3.16b, { v3.16b }, v16.16b
-       eor     v14.16b, v14.16b, v4.16b
-       add     v7.4s, v7.4s, v2.4s
-       add     v13.4s, v13.4s, v27.4s
-       add     v6.4s, v6.4s, v3.4s
-       tbl     v14.16b, { v14.16b }, v16.16b
-       eor     v11.16b, v11.16b, v7.16b
-       add     v13.4s, v13.4s, v0.4s
-       ldr     q27, [sp, #96]
-       mov     v21.16b, v26.16b
-       stur    q26, [x29, #-96]
-       mov     v28.16b, v31.16b
-       eor     v10.16b, v6.16b, v10.16b
-       add     v8.4s, v8.4s, v14.4s
-       tbl     v11.16b, { v11.16b }, v16.16b
-       eor     v12.16b, v12.16b, v13.16b
-       ldp     q31, q26, [x29, #-192]
-       ushr    v15.4s, v10.4s, #12
-       shl     v10.4s, v10.4s, #20
-       eor     v9.16b, v8.16b, v9.16b
-       add     v1.4s, v1.4s, v11.4s
-       tbl     v12.16b, { v12.16b }, v16.16b
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v9.4s, #12
-       shl     v9.4s, v9.4s, #20
-       eor     v2.16b, v1.16b, v2.16b
-       add     v5.4s, v5.4s, v12.4s
-       add     v17.4s, v17.4s, v20.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v2.4s, #12
-       shl     v2.4s, v2.4s, #20
-       eor     v0.16b, v5.16b, v0.16b
-       add     v17.4s, v17.4s, v10.4s
-       add     v4.4s, v4.4s, v27.4s
-       orr     v2.16b, v2.16b, v15.16b
-       ushr    v15.4s, v0.4s, #12
+       eor     v5.16b, v21.16b, v5.16b
+       eor     v1.16b, v19.16b, v1.16b
+       orr     v0.16b, v0.16b, v12.16b
+       add     v22.4s, v22.4s, v23.4s
+       orr     v2.16b, v2.16b, v13.16b
+       ushr    v13.4s, v5.4s, #7
+       shl     v5.4s, v5.4s, #25
+       add     v17.4s, v17.4s, v11.4s
+       mov     v30.16b, v28.16b
+       mov     v28.16b, v23.16b
+       ldr     q23, [sp, #304]
+       ushr    v12.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       add     v22.4s, v22.4s, v0.4s
+       mov     v29.16b, v31.16b
+       ldr     q31, [sp, #160]
+       orr     v5.16b, v5.16b, v13.16b
+       add     v17.4s, v17.4s, v2.4s
+       add     v10.4s, v10.4s, v23.4s
+       orr     v1.16b, v1.16b, v12.16b
+       str     q29, [sp, #272]
+       eor     v16.16b, v16.16b, v22.16b
+       add     v20.4s, v20.4s, v31.4s
+       eor     v6.16b, v6.16b, v17.16b
+       add     v10.4s, v10.4s, v5.4s
+       tbl     v16.16b, { v16.16b }, v27.16b
+       add     v20.4s, v20.4s, v1.4s
+       tbl     v6.16b, { v6.16b }, v27.16b
+       eor     v25.16b, v25.16b, v10.16b
+       add     v21.4s, v21.4s, v16.4s
+       eor     v4.16b, v4.16b, v20.16b
+       add     v26.4s, v26.4s, v6.4s
+       tbl     v25.16b, { v25.16b }, v27.16b
+       eor     v0.16b, v21.16b, v0.16b
+       tbl     v4.16b, { v4.16b }, v27.16b
+       eor     v2.16b, v26.16b, v2.16b
+       add     v19.4s, v19.4s, v25.4s
+       ushr    v12.4s, v0.4s, #12
        shl     v0.4s, v0.4s, #20
-       eor     v3.16b, v3.16b, v17.16b
-       add     v4.4s, v4.4s, v9.4s
-       add     v7.4s, v7.4s, v26.4s
-       orr     v0.16b, v0.16b, v15.16b
-       tbl     v3.16b, { v3.16b }, v19.16b
-       eor     v14.16b, v14.16b, v4.16b
-       add     v7.4s, v7.4s, v2.4s
-       add     v13.4s, v13.4s, v31.4s
-       add     v6.4s, v6.4s, v3.4s
-       tbl     v14.16b, { v14.16b }, v19.16b
-       eor     v11.16b, v11.16b, v7.16b
-       add     v13.4s, v13.4s, v0.4s
-       eor     v10.16b, v6.16b, v10.16b
-       add     v8.4s, v8.4s, v14.4s
-       tbl     v11.16b, { v11.16b }, v19.16b
-       eor     v12.16b, v12.16b, v13.16b
-       ushr    v15.4s, v10.4s, #7
-       shl     v10.4s, v10.4s, #25
-       eor     v9.16b, v8.16b, v9.16b
-       add     v1.4s, v1.4s, v11.4s
-       tbl     v12.16b, { v12.16b }, v19.16b
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v9.4s, #7
-       shl     v9.4s, v9.4s, #25
-       eor     v2.16b, v1.16b, v2.16b
-       add     v5.4s, v5.4s, v12.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v2.4s, #7
-       shl     v2.4s, v2.4s, #25
-       eor     v0.16b, v5.16b, v0.16b
-       mov     v18.16b, v24.16b
-       mov     v24.16b, v20.16b
-       orr     v2.16b, v2.16b, v15.16b
-       ushr    v15.4s, v0.4s, #7
+       add     v7.4s, v7.4s, v4.4s
+       ushr    v13.4s, v2.4s, #12
+       shl     v2.4s, v2.4s, #20
+       eor     v5.16b, v5.16b, v19.16b
+       add     v22.4s, v22.4s, v24.4s
+       ldr     q24, [sp, #320]
+       orr     v0.16b, v0.16b, v12.16b
+       eor     v1.16b, v7.16b, v1.16b
+       orr     v2.16b, v2.16b, v13.16b
+       ushr    v12.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       add     v17.4s, v17.4s, v24.4s
+       ldr     q24, [sp, #352]
+       ushr    v13.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
+       add     v22.4s, v22.4s, v0.4s
+       orr     v5.16b, v5.16b, v12.16b
+       add     v17.4s, v17.4s, v2.4s
+       add     v10.4s, v10.4s, v24.4s
+       ldr     q24, [sp, #336]
+       orr     v1.16b, v1.16b, v13.16b
+       eor     v16.16b, v16.16b, v22.16b
+       add     v20.4s, v20.4s, v14.4s
+       eor     v6.16b, v6.16b, v17.16b
+       add     v10.4s, v10.4s, v5.4s
+       tbl     v16.16b, { v16.16b }, v18.16b
+       add     v20.4s, v20.4s, v1.4s
+       tbl     v6.16b, { v6.16b }, v18.16b
+       eor     v25.16b, v25.16b, v10.16b
+       add     v21.4s, v21.4s, v16.4s
+       eor     v4.16b, v4.16b, v20.16b
+       add     v26.4s, v26.4s, v6.4s
+       tbl     v25.16b, { v25.16b }, v18.16b
+       eor     v0.16b, v21.16b, v0.16b
+       tbl     v4.16b, { v4.16b }, v18.16b
+       eor     v2.16b, v26.16b, v2.16b
+       add     v19.4s, v19.4s, v25.4s
+       ushr    v12.4s, v0.4s, #7
        shl     v0.4s, v0.4s, #25
-       ldur    q20, [x29, #-160]
-       orr     v0.16b, v0.16b, v15.16b
-       add     v17.4s, v17.4s, v21.4s
-       add     v17.4s, v17.4s, v0.4s
-       add     v4.4s, v4.4s, v18.4s
-       eor     v14.16b, v14.16b, v17.16b
-       add     v4.4s, v4.4s, v10.4s
-       add     v7.4s, v7.4s, v23.4s
-       tbl     v14.16b, { v14.16b }, v16.16b
-       eor     v11.16b, v11.16b, v4.16b
-       add     v7.4s, v7.4s, v9.4s
-       add     v13.4s, v13.4s, v20.4s
-       add     v1.4s, v1.4s, v14.4s
-       tbl     v11.16b, { v11.16b }, v16.16b
-       eor     v12.16b, v12.16b, v7.16b
-       add     v13.4s, v13.4s, v2.4s
-       eor     v0.16b, v0.16b, v1.16b
-       add     v5.4s, v5.4s, v11.4s
-       tbl     v12.16b, { v12.16b }, v16.16b
-       eor     v3.16b, v3.16b, v13.16b
-       ldur    q25, [x29, #-80]
-       ushr    v15.4s, v0.4s, #12
+       add     v7.4s, v7.4s, v4.4s
+       ushr    v13.4s, v2.4s, #7
+       shl     v2.4s, v2.4s, #25
+       eor     v5.16b, v19.16b, v5.16b
+       orr     v0.16b, v0.16b, v12.16b
+       eor     v1.16b, v7.16b, v1.16b
+       add     v10.4s, v10.4s, v24.4s
+       orr     v2.16b, v2.16b, v13.16b
+       ushr    v12.4s, v5.4s, #7
+       shl     v5.4s, v5.4s, #25
+       add     v22.4s, v22.4s, v29.4s
+       ushr    v13.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       add     v10.4s, v10.4s, v0.4s
+       orr     v5.16b, v5.16b, v12.16b
+       add     v22.4s, v22.4s, v2.4s
+       add     v20.4s, v20.4s, v8.4s
+       ldr     q8, [sp, #288]
+       orr     v1.16b, v1.16b, v13.16b
+       add     v17.4s, v17.4s, v3.4s
+       ldr     q3, [sp, #352]
+       eor     v4.16b, v4.16b, v10.16b
+       eor     v25.16b, v25.16b, v22.16b
+       add     v20.4s, v20.4s, v5.4s
+       add     v17.4s, v17.4s, v1.4s
+       tbl     v4.16b, { v4.16b }, v27.16b
+       tbl     v25.16b, { v25.16b }, v27.16b
+       eor     v6.16b, v6.16b, v20.16b
+       eor     v16.16b, v16.16b, v17.16b
+       add     v26.4s, v26.4s, v4.4s
+       add     v7.4s, v7.4s, v25.4s
+       tbl     v6.16b, { v6.16b }, v27.16b
+       tbl     v16.16b, { v16.16b }, v27.16b
+       eor     v0.16b, v26.16b, v0.16b
+       eor     v2.16b, v7.16b, v2.16b
+       add     v21.4s, v21.4s, v6.4s
+       add     v19.4s, v19.4s, v16.4s
+       ushr    v12.4s, v0.4s, #12
        shl     v0.4s, v0.4s, #20
-       eor     v10.16b, v5.16b, v10.16b
-       add     v6.4s, v6.4s, v12.4s
-       tbl     v3.16b, { v3.16b }, v16.16b
-       orr     v0.16b, v0.16b, v15.16b
-       ushr    v15.4s, v10.4s, #12
-       shl     v10.4s, v10.4s, #20
-       eor     v9.16b, v6.16b, v9.16b
-       add     v8.4s, v8.4s, v3.4s
-       add     v17.4s, v17.4s, v29.4s
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v9.4s, #12
-       shl     v9.4s, v9.4s, #20
-       eor     v2.16b, v8.16b, v2.16b
-       add     v17.4s, v17.4s, v0.4s
-       add     v4.4s, v4.4s, v22.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v2.4s, #12
+       ushr    v13.4s, v2.4s, #12
        shl     v2.4s, v2.4s, #20
-       eor     v14.16b, v14.16b, v17.16b
-       add     v4.4s, v4.4s, v10.4s
+       eor     v5.16b, v21.16b, v5.16b
+       eor     v1.16b, v19.16b, v1.16b
+       orr     v0.16b, v0.16b, v12.16b
+       add     v10.4s, v10.4s, v30.4s
+       orr     v2.16b, v2.16b, v13.16b
+       ushr    v13.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       add     v22.4s, v22.4s, v8.4s
+       mov     v24.16b, v30.16b
+       mov     v30.16b, v15.16b
+       add     v17.4s, v17.4s, v15.4s
+       ldr     q15, [sp, #224]
+       ushr    v12.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
+       add     v10.4s, v10.4s, v0.4s
+       str     q30, [sp, #176]
+       orr     v5.16b, v5.16b, v13.16b
+       add     v22.4s, v22.4s, v2.4s
+       add     v20.4s, v20.4s, v15.4s
+       orr     v1.16b, v1.16b, v12.16b
+       eor     v4.16b, v4.16b, v10.16b
+       eor     v25.16b, v25.16b, v22.16b
+       add     v20.4s, v20.4s, v5.4s
+       add     v17.4s, v17.4s, v1.4s
+       tbl     v4.16b, { v4.16b }, v18.16b
+       tbl     v25.16b, { v25.16b }, v18.16b
+       eor     v6.16b, v6.16b, v20.16b
+       eor     v16.16b, v16.16b, v17.16b
+       add     v26.4s, v26.4s, v4.4s
        add     v7.4s, v7.4s, v25.4s
-       orr     v2.16b, v2.16b, v15.16b
-       tbl     v14.16b, { v14.16b }, v19.16b
-       eor     v11.16b, v11.16b, v4.16b
-       add     v7.4s, v7.4s, v9.4s
-       add     v13.4s, v13.4s, v26.4s
-       add     v1.4s, v1.4s, v14.4s
-       tbl     v11.16b, { v11.16b }, v19.16b
-       eor     v12.16b, v12.16b, v7.16b
-       add     v13.4s, v13.4s, v2.4s
-       ldur    q25, [x29, #-112]
-       eor     v0.16b, v0.16b, v1.16b
-       add     v5.4s, v5.4s, v11.4s
-       tbl     v12.16b, { v12.16b }, v19.16b
-       eor     v3.16b, v3.16b, v13.16b
-       ushr    v15.4s, v0.4s, #7
+       tbl     v6.16b, { v6.16b }, v18.16b
+       tbl     v16.16b, { v16.16b }, v18.16b
+       eor     v0.16b, v26.16b, v0.16b
+       eor     v2.16b, v7.16b, v2.16b
+       add     v21.4s, v21.4s, v6.4s
+       add     v19.4s, v19.4s, v16.4s
+       ushr    v12.4s, v0.4s, #7
        shl     v0.4s, v0.4s, #25
-       eor     v10.16b, v5.16b, v10.16b
-       add     v6.4s, v6.4s, v12.4s
-       tbl     v3.16b, { v3.16b }, v19.16b
-       orr     v0.16b, v0.16b, v15.16b
-       ushr    v15.4s, v10.4s, #7
-       shl     v10.4s, v10.4s, #25
-       eor     v9.16b, v6.16b, v9.16b
-       add     v8.4s, v8.4s, v3.4s
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v9.4s, #7
-       shl     v9.4s, v9.4s, #25
-       eor     v2.16b, v8.16b, v2.16b
-       add     v17.4s, v17.4s, v25.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v2.4s, #7
+       ushr    v13.4s, v2.4s, #7
        shl     v2.4s, v2.4s, #25
-       add     v17.4s, v17.4s, v10.4s
-       add     v4.4s, v4.4s, v30.4s
-       orr     v2.16b, v2.16b, v15.16b
-       eor     v3.16b, v3.16b, v17.16b
-       add     v4.4s, v4.4s, v9.4s
-       add     v7.4s, v7.4s, v24.4s
-       tbl     v3.16b, { v3.16b }, v16.16b
-       eor     v14.16b, v14.16b, v4.16b
-       add     v7.4s, v7.4s, v2.4s
-       add     v13.4s, v13.4s, v31.4s
-       add     v6.4s, v6.4s, v3.4s
-       tbl     v14.16b, { v14.16b }, v16.16b
-       eor     v11.16b, v11.16b, v7.16b
-       add     v13.4s, v13.4s, v0.4s
-       ldur    q25, [x29, #-64]
-       eor     v10.16b, v6.16b, v10.16b
-       add     v8.4s, v8.4s, v14.4s
-       tbl     v11.16b, { v11.16b }, v16.16b
-       eor     v12.16b, v12.16b, v13.16b
-       ldr     q31, [sp, #224]
-       ushr    v15.4s, v10.4s, #12
-       shl     v10.4s, v10.4s, #20
-       eor     v9.16b, v8.16b, v9.16b
-       add     v1.4s, v1.4s, v11.4s
-       tbl     v12.16b, { v12.16b }, v16.16b
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v9.4s, #12
-       shl     v9.4s, v9.4s, #20
-       eor     v2.16b, v1.16b, v2.16b
-       add     v5.4s, v5.4s, v12.4s
-       add     v17.4s, v17.4s, v27.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v2.4s, #12
-       shl     v2.4s, v2.4s, #20
-       eor     v0.16b, v5.16b, v0.16b
-       add     v17.4s, v17.4s, v10.4s
-       add     v4.4s, v4.4s, v25.4s
-       orr     v2.16b, v2.16b, v15.16b
-       ushr    v15.4s, v0.4s, #12
+       eor     v5.16b, v21.16b, v5.16b
+       eor     v1.16b, v19.16b, v1.16b
+       orr     v0.16b, v0.16b, v12.16b
+       add     v22.4s, v22.4s, v9.4s
+       orr     v2.16b, v2.16b, v13.16b
+       ushr    v13.4s, v5.4s, #7
+       shl     v5.4s, v5.4s, #25
+       add     v17.4s, v17.4s, v14.4s
+       ushr    v12.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       add     v22.4s, v22.4s, v0.4s
+       orr     v5.16b, v5.16b, v13.16b
+       add     v17.4s, v17.4s, v2.4s
+       add     v10.4s, v10.4s, v28.4s
+       orr     v1.16b, v1.16b, v12.16b
+       eor     v16.16b, v16.16b, v22.16b
+       add     v20.4s, v20.4s, v11.4s
+       eor     v6.16b, v6.16b, v17.16b
+       add     v10.4s, v10.4s, v5.4s
+       tbl     v16.16b, { v16.16b }, v27.16b
+       add     v20.4s, v20.4s, v1.4s
+       tbl     v6.16b, { v6.16b }, v27.16b
+       eor     v25.16b, v25.16b, v10.16b
+       add     v21.4s, v21.4s, v16.4s
+       eor     v4.16b, v4.16b, v20.16b
+       add     v26.4s, v26.4s, v6.4s
+       tbl     v25.16b, { v25.16b }, v27.16b
+       eor     v0.16b, v21.16b, v0.16b
+       tbl     v4.16b, { v4.16b }, v27.16b
+       eor     v2.16b, v26.16b, v2.16b
+       add     v19.4s, v19.4s, v25.4s
+       ushr    v12.4s, v0.4s, #12
        shl     v0.4s, v0.4s, #20
-       eor     v3.16b, v3.16b, v17.16b
-       add     v4.4s, v4.4s, v9.4s
-       add     v7.4s, v7.4s, v31.4s
-       orr     v0.16b, v0.16b, v15.16b
-       tbl     v3.16b, { v3.16b }, v19.16b
-       eor     v14.16b, v14.16b, v4.16b
-       add     v7.4s, v7.4s, v2.4s
-       add     v13.4s, v13.4s, v28.4s
-       add     v6.4s, v6.4s, v3.4s
-       tbl     v14.16b, { v14.16b }, v19.16b
-       eor     v11.16b, v11.16b, v7.16b
-       add     v13.4s, v13.4s, v0.4s
-       eor     v10.16b, v6.16b, v10.16b
-       add     v8.4s, v8.4s, v14.4s
-       tbl     v11.16b, { v11.16b }, v19.16b
-       eor     v12.16b, v12.16b, v13.16b
-       ushr    v15.4s, v10.4s, #7
-       shl     v10.4s, v10.4s, #25
-       eor     v9.16b, v8.16b, v9.16b
-       add     v1.4s, v1.4s, v11.4s
-       tbl     v12.16b, { v12.16b }, v19.16b
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v9.4s, #7
-       shl     v9.4s, v9.4s, #25
-       eor     v2.16b, v1.16b, v2.16b
-       add     v5.4s, v5.4s, v12.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v2.4s, #7
-       shl     v2.4s, v2.4s, #25
-       eor     v0.16b, v5.16b, v0.16b
-       orr     v2.16b, v2.16b, v15.16b
-       ushr    v15.4s, v0.4s, #7
+       add     v7.4s, v7.4s, v4.4s
+       ushr    v13.4s, v2.4s, #12
+       shl     v2.4s, v2.4s, #20
+       eor     v5.16b, v5.16b, v19.16b
+       orr     v0.16b, v0.16b, v12.16b
+       eor     v1.16b, v7.16b, v1.16b
+       add     v22.4s, v22.4s, v29.4s
+       orr     v2.16b, v2.16b, v13.16b
+       ushr    v12.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       add     v17.4s, v17.4s, v23.4s
+       ushr    v13.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
+       add     v22.4s, v22.4s, v0.4s
+       orr     v5.16b, v5.16b, v12.16b
+       add     v17.4s, v17.4s, v2.4s
+       add     v10.4s, v10.4s, v31.4s
+       orr     v1.16b, v1.16b, v13.16b
+       eor     v16.16b, v16.16b, v22.16b
+       add     v20.4s, v20.4s, v30.4s
+       eor     v6.16b, v6.16b, v17.16b
+       add     v10.4s, v10.4s, v5.4s
+       tbl     v16.16b, { v16.16b }, v18.16b
+       add     v20.4s, v20.4s, v1.4s
+       tbl     v6.16b, { v6.16b }, v18.16b
+       eor     v25.16b, v25.16b, v10.16b
+       add     v21.4s, v21.4s, v16.4s
+       eor     v4.16b, v4.16b, v20.16b
+       add     v26.4s, v26.4s, v6.4s
+       tbl     v25.16b, { v25.16b }, v18.16b
+       eor     v0.16b, v21.16b, v0.16b
+       tbl     v4.16b, { v4.16b }, v18.16b
+       eor     v2.16b, v26.16b, v2.16b
+       add     v19.4s, v19.4s, v25.4s
+       ushr    v12.4s, v0.4s, #7
        shl     v0.4s, v0.4s, #25
-       orr     v0.16b, v0.16b, v15.16b
-       add     v17.4s, v17.4s, v18.4s
-       add     v17.4s, v17.4s, v0.4s
-       add     v4.4s, v4.4s, v22.4s
-       eor     v14.16b, v14.16b, v17.16b
-       add     v4.4s, v4.4s, v10.4s
-       add     v7.4s, v7.4s, v26.4s
-       tbl     v14.16b, { v14.16b }, v16.16b
-       eor     v11.16b, v11.16b, v4.16b
-       add     v7.4s, v7.4s, v9.4s
-       add     v13.4s, v13.4s, v23.4s
-       add     v1.4s, v1.4s, v14.4s
-       tbl     v11.16b, { v11.16b }, v16.16b
-       eor     v12.16b, v12.16b, v7.16b
-       add     v13.4s, v13.4s, v2.4s
-       mov     v21.16b, v29.16b
-       stur    q29, [x29, #-128]
-       mov     v29.16b, v30.16b
-       mov     v30.16b, v27.16b
-       mov     v27.16b, v18.16b
-       str     q18, [sp, #176]
-       eor     v0.16b, v0.16b, v1.16b
-       mov     v18.16b, v22.16b
-       add     v5.4s, v5.4s, v11.4s
-       tbl     v12.16b, { v12.16b }, v16.16b
-       eor     v3.16b, v3.16b, v13.16b
-       ldur    q22, [x29, #-96]
-       ushr    v15.4s, v0.4s, #12
+       add     v7.4s, v7.4s, v4.4s
+       ushr    v13.4s, v2.4s, #7
+       shl     v2.4s, v2.4s, #25
+       eor     v5.16b, v19.16b, v5.16b
+       add     v10.4s, v10.4s, v3.4s
+       ldr     q3, [sp, #192]
+       orr     v0.16b, v0.16b, v12.16b
+       eor     v1.16b, v7.16b, v1.16b
+       orr     v2.16b, v2.16b, v13.16b
+       ushr    v12.4s, v5.4s, #7
+       shl     v5.4s, v5.4s, #25
+       add     v22.4s, v22.4s, v3.4s
+       ushr    v13.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       add     v10.4s, v10.4s, v0.4s
+       orr     v5.16b, v5.16b, v12.16b
+       add     v22.4s, v22.4s, v2.4s
+       add     v20.4s, v20.4s, v15.4s
+       ldr     q15, [sp, #128]
+       orr     v1.16b, v1.16b, v13.16b
+       add     v17.4s, v17.4s, v24.4s
+       eor     v4.16b, v4.16b, v10.16b
+       eor     v25.16b, v25.16b, v22.16b
+       add     v20.4s, v20.4s, v5.4s
+       add     v17.4s, v17.4s, v1.4s
+       tbl     v4.16b, { v4.16b }, v27.16b
+       tbl     v25.16b, { v25.16b }, v27.16b
+       eor     v6.16b, v6.16b, v20.16b
+       eor     v16.16b, v16.16b, v17.16b
+       add     v26.4s, v26.4s, v4.4s
+       add     v7.4s, v7.4s, v25.4s
+       tbl     v6.16b, { v6.16b }, v27.16b
+       tbl     v16.16b, { v16.16b }, v27.16b
+       eor     v0.16b, v26.16b, v0.16b
+       eor     v2.16b, v7.16b, v2.16b
+       add     v21.4s, v21.4s, v6.4s
+       add     v19.4s, v19.4s, v16.4s
+       ushr    v12.4s, v0.4s, #12
        shl     v0.4s, v0.4s, #20
-       eor     v10.16b, v5.16b, v10.16b
-       add     v6.4s, v6.4s, v12.4s
-       tbl     v3.16b, { v3.16b }, v16.16b
-       orr     v0.16b, v0.16b, v15.16b
-       ushr    v15.4s, v10.4s, #12
-       shl     v10.4s, v10.4s, #20
-       eor     v9.16b, v6.16b, v9.16b
-       add     v8.4s, v8.4s, v3.4s
-       add     v17.4s, v17.4s, v20.4s
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v9.4s, #12
-       shl     v9.4s, v9.4s, #20
-       eor     v2.16b, v8.16b, v2.16b
-       add     v17.4s, v17.4s, v0.4s
-       add     v4.4s, v4.4s, v29.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v2.4s, #12
+       ushr    v13.4s, v2.4s, #12
        shl     v2.4s, v2.4s, #20
-       eor     v14.16b, v14.16b, v17.16b
-       add     v4.4s, v4.4s, v10.4s
-       add     v7.4s, v7.4s, v22.4s
-       orr     v2.16b, v2.16b, v15.16b
-       tbl     v14.16b, { v14.16b }, v19.16b
-       eor     v11.16b, v11.16b, v4.16b
-       add     v7.4s, v7.4s, v9.4s
-       add     v13.4s, v13.4s, v31.4s
-       add     v1.4s, v1.4s, v14.4s
-       tbl     v11.16b, { v11.16b }, v19.16b
-       eor     v12.16b, v12.16b, v7.16b
-       add     v13.4s, v13.4s, v2.4s
-       eor     v0.16b, v0.16b, v1.16b
-       add     v5.4s, v5.4s, v11.4s
-       tbl     v12.16b, { v12.16b }, v19.16b
-       eor     v3.16b, v3.16b, v13.16b
-       ushr    v15.4s, v0.4s, #7
+       eor     v5.16b, v21.16b, v5.16b
+       ldp     q23, q11, [sp, #320]
+       eor     v1.16b, v19.16b, v1.16b
+       orr     v0.16b, v0.16b, v12.16b
+       add     v10.4s, v10.4s, v8.4s
+       orr     v2.16b, v2.16b, v13.16b
+       ushr    v13.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       add     v22.4s, v22.4s, v23.4s
+       ushr    v12.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
+       add     v10.4s, v10.4s, v0.4s
+       mov     v28.16b, v31.16b
+       mov     v31.16b, v8.16b
+       ldr     q8, [sp, #208]
+       orr     v5.16b, v5.16b, v13.16b
+       add     v22.4s, v22.4s, v2.4s
+       add     v20.4s, v20.4s, v11.4s
+       orr     v1.16b, v1.16b, v12.16b
+       add     v17.4s, v17.4s, v8.4s
+       eor     v4.16b, v4.16b, v10.16b
+       eor     v25.16b, v25.16b, v22.16b
+       add     v20.4s, v20.4s, v5.4s
+       add     v17.4s, v17.4s, v1.4s
+       tbl     v4.16b, { v4.16b }, v18.16b
+       tbl     v25.16b, { v25.16b }, v18.16b
+       eor     v6.16b, v6.16b, v20.16b
+       eor     v16.16b, v16.16b, v17.16b
+       add     v26.4s, v26.4s, v4.4s
+       add     v7.4s, v7.4s, v25.4s
+       tbl     v6.16b, { v6.16b }, v18.16b
+       tbl     v16.16b, { v16.16b }, v18.16b
+       eor     v0.16b, v26.16b, v0.16b
+       eor     v2.16b, v7.16b, v2.16b
+       add     v21.4s, v21.4s, v6.4s
+       add     v19.4s, v19.4s, v16.4s
+       ushr    v12.4s, v0.4s, #7
        shl     v0.4s, v0.4s, #25
-       eor     v10.16b, v5.16b, v10.16b
-       add     v6.4s, v6.4s, v12.4s
-       tbl     v3.16b, { v3.16b }, v19.16b
-       orr     v0.16b, v0.16b, v15.16b
-       ushr    v15.4s, v10.4s, #7
-       shl     v10.4s, v10.4s, #25
-       eor     v9.16b, v6.16b, v9.16b
-       add     v8.4s, v8.4s, v3.4s
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v9.4s, #7
-       shl     v9.4s, v9.4s, #25
-       eor     v2.16b, v8.16b, v2.16b
-       add     v17.4s, v17.4s, v21.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v2.4s, #7
+       ushr    v13.4s, v2.4s, #7
        shl     v2.4s, v2.4s, #25
-       add     v17.4s, v17.4s, v10.4s
-       add     v4.4s, v4.4s, v24.4s
-       orr     v2.16b, v2.16b, v15.16b
-       eor     v3.16b, v3.16b, v17.16b
-       add     v4.4s, v4.4s, v9.4s
-       add     v7.4s, v7.4s, v30.4s
-       tbl     v3.16b, { v3.16b }, v16.16b
-       eor     v14.16b, v14.16b, v4.16b
-       add     v7.4s, v7.4s, v2.4s
-       add     v13.4s, v13.4s, v28.4s
-       add     v6.4s, v6.4s, v3.4s
-       mov     v22.16b, v24.16b
-       tbl     v14.16b, { v14.16b }, v16.16b
-       eor     v11.16b, v11.16b, v7.16b
-       add     v13.4s, v13.4s, v0.4s
-       ldur    q24, [x29, #-80]
-       eor     v10.16b, v6.16b, v10.16b
-       add     v8.4s, v8.4s, v14.4s
-       mov     v21.16b, v30.16b
-       tbl     v11.16b, { v11.16b }, v16.16b
-       eor     v12.16b, v12.16b, v13.16b
-       ldur    q30, [x29, #-192]
-       mov     v20.16b, v29.16b
-       ushr    v15.4s, v10.4s, #12
-       shl     v10.4s, v10.4s, #20
-       eor     v9.16b, v8.16b, v9.16b
-       add     v1.4s, v1.4s, v11.4s
-       tbl     v12.16b, { v12.16b }, v16.16b
-       ldur    q29, [x29, #-112]
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v9.4s, #12
-       shl     v9.4s, v9.4s, #20
-       eor     v2.16b, v1.16b, v2.16b
-       add     v5.4s, v5.4s, v12.4s
-       add     v17.4s, v17.4s, v25.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v2.4s, #12
-       shl     v2.4s, v2.4s, #20
-       eor     v0.16b, v5.16b, v0.16b
-       add     v17.4s, v17.4s, v10.4s
-       add     v4.4s, v4.4s, v24.4s
-       orr     v2.16b, v2.16b, v15.16b
-       ushr    v15.4s, v0.4s, #12
+       eor     v5.16b, v21.16b, v5.16b
+       eor     v1.16b, v19.16b, v1.16b
+       orr     v0.16b, v0.16b, v12.16b
+       add     v22.4s, v22.4s, v29.4s
+       orr     v2.16b, v2.16b, v13.16b
+       ushr    v13.4s, v5.4s, #7
+       shl     v5.4s, v5.4s, #25
+       add     v17.4s, v17.4s, v30.4s
+       ushr    v12.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       add     v22.4s, v22.4s, v0.4s
+       orr     v5.16b, v5.16b, v13.16b
+       add     v17.4s, v17.4s, v2.4s
+       add     v10.4s, v10.4s, v9.4s
+       orr     v1.16b, v1.16b, v12.16b
+       eor     v16.16b, v16.16b, v22.16b
+       add     v20.4s, v20.4s, v14.4s
+       ldr     q14, [sp, #256]
+       eor     v6.16b, v6.16b, v17.16b
+       add     v10.4s, v10.4s, v5.4s
+       tbl     v16.16b, { v16.16b }, v27.16b
+       add     v20.4s, v20.4s, v1.4s
+       tbl     v6.16b, { v6.16b }, v27.16b
+       eor     v25.16b, v25.16b, v10.16b
+       add     v21.4s, v21.4s, v16.4s
+       eor     v4.16b, v4.16b, v20.16b
+       add     v26.4s, v26.4s, v6.4s
+       tbl     v25.16b, { v25.16b }, v27.16b
+       eor     v0.16b, v21.16b, v0.16b
+       tbl     v4.16b, { v4.16b }, v27.16b
+       eor     v2.16b, v26.16b, v2.16b
+       add     v19.4s, v19.4s, v25.4s
+       ushr    v12.4s, v0.4s, #12
        shl     v0.4s, v0.4s, #20
-       eor     v3.16b, v3.16b, v17.16b
-       add     v4.4s, v4.4s, v9.4s
-       add     v7.4s, v7.4s, v30.4s
-       orr     v0.16b, v0.16b, v15.16b
-       tbl     v3.16b, { v3.16b }, v19.16b
-       eor     v14.16b, v14.16b, v4.16b
-       add     v7.4s, v7.4s, v2.4s
-       add     v13.4s, v13.4s, v29.4s
-       add     v6.4s, v6.4s, v3.4s
-       tbl     v14.16b, { v14.16b }, v19.16b
-       eor     v11.16b, v11.16b, v7.16b
-       add     v13.4s, v13.4s, v0.4s
-       eor     v10.16b, v6.16b, v10.16b
-       add     v8.4s, v8.4s, v14.4s
-       tbl     v11.16b, { v11.16b }, v19.16b
-       eor     v12.16b, v12.16b, v13.16b
-       ushr    v15.4s, v10.4s, #7
-       shl     v10.4s, v10.4s, #25
-       eor     v9.16b, v8.16b, v9.16b
-       add     v1.4s, v1.4s, v11.4s
-       tbl     v12.16b, { v12.16b }, v19.16b
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v9.4s, #7
-       shl     v9.4s, v9.4s, #25
-       eor     v2.16b, v1.16b, v2.16b
-       add     v5.4s, v5.4s, v12.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v2.4s, #7
-       shl     v2.4s, v2.4s, #25
-       eor     v0.16b, v5.16b, v0.16b
-       orr     v2.16b, v2.16b, v15.16b
-       ushr    v15.4s, v0.4s, #7
+       add     v7.4s, v7.4s, v4.4s
+       ushr    v13.4s, v2.4s, #12
+       shl     v2.4s, v2.4s, #20
+       eor     v5.16b, v5.16b, v19.16b
+       orr     v0.16b, v0.16b, v12.16b
+       eor     v1.16b, v7.16b, v1.16b
+       add     v22.4s, v22.4s, v3.4s
+       orr     v2.16b, v2.16b, v13.16b
+       ushr    v12.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       add     v17.4s, v17.4s, v15.4s
+       ushr    v13.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
+       add     v22.4s, v22.4s, v0.4s
+       orr     v5.16b, v5.16b, v12.16b
+       add     v17.4s, v17.4s, v2.4s
+       add     v10.4s, v10.4s, v14.4s
+       orr     v1.16b, v1.16b, v13.16b
+       eor     v16.16b, v16.16b, v22.16b
+       add     v20.4s, v20.4s, v8.4s
+       eor     v6.16b, v6.16b, v17.16b
+       add     v10.4s, v10.4s, v5.4s
+       tbl     v16.16b, { v16.16b }, v18.16b
+       add     v20.4s, v20.4s, v1.4s
+       tbl     v6.16b, { v6.16b }, v18.16b
+       eor     v25.16b, v25.16b, v10.16b
+       add     v21.4s, v21.4s, v16.4s
+       eor     v4.16b, v4.16b, v20.16b
+       add     v26.4s, v26.4s, v6.4s
+       tbl     v25.16b, { v25.16b }, v18.16b
+       eor     v0.16b, v21.16b, v0.16b
+       tbl     v4.16b, { v4.16b }, v18.16b
+       eor     v2.16b, v26.16b, v2.16b
+       add     v19.4s, v19.4s, v25.4s
+       ushr    v12.4s, v0.4s, #7
        shl     v0.4s, v0.4s, #25
-       orr     v0.16b, v0.16b, v15.16b
-       add     v17.4s, v17.4s, v18.4s
-       add     v17.4s, v17.4s, v0.4s
-       add     v4.4s, v4.4s, v20.4s
-       eor     v14.16b, v14.16b, v17.16b
-       add     v4.4s, v4.4s, v10.4s
-       add     v7.4s, v7.4s, v31.4s
-       tbl     v14.16b, { v14.16b }, v16.16b
-       eor     v11.16b, v11.16b, v4.16b
-       add     v7.4s, v7.4s, v9.4s
-       add     v13.4s, v13.4s, v26.4s
-       add     v1.4s, v1.4s, v14.4s
-       tbl     v11.16b, { v11.16b }, v16.16b
-       eor     v12.16b, v12.16b, v7.16b
-       add     v13.4s, v13.4s, v2.4s
-       eor     v0.16b, v0.16b, v1.16b
-       add     v5.4s, v5.4s, v11.4s
-       tbl     v12.16b, { v12.16b }, v16.16b
-       eor     v3.16b, v3.16b, v13.16b
-       ushr    v15.4s, v0.4s, #12
+       add     v7.4s, v7.4s, v4.4s
+       ushr    v13.4s, v2.4s, #7
+       shl     v2.4s, v2.4s, #25
+       eor     v5.16b, v19.16b, v5.16b
+       orr     v0.16b, v0.16b, v12.16b
+       eor     v1.16b, v7.16b, v1.16b
+       add     v10.4s, v10.4s, v28.4s
+       orr     v2.16b, v2.16b, v13.16b
+       ushr    v12.4s, v5.4s, #7
+       shl     v5.4s, v5.4s, #25
+       add     v22.4s, v22.4s, v24.4s
+       ushr    v13.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       add     v10.4s, v10.4s, v0.4s
+       orr     v5.16b, v5.16b, v12.16b
+       add     v22.4s, v22.4s, v2.4s
+       add     v20.4s, v20.4s, v11.4s
+       ldr     q11, [sp, #304]
+       orr     v1.16b, v1.16b, v13.16b
+       add     v17.4s, v17.4s, v31.4s
+       ldr     q31, [sp, #224]
+       eor     v4.16b, v4.16b, v10.16b
+       eor     v25.16b, v25.16b, v22.16b
+       add     v20.4s, v20.4s, v5.4s
+       add     v17.4s, v17.4s, v1.4s
+       tbl     v4.16b, { v4.16b }, v27.16b
+       tbl     v25.16b, { v25.16b }, v27.16b
+       eor     v6.16b, v6.16b, v20.16b
+       eor     v16.16b, v16.16b, v17.16b
+       add     v26.4s, v26.4s, v4.4s
+       add     v7.4s, v7.4s, v25.4s
+       tbl     v6.16b, { v6.16b }, v27.16b
+       tbl     v16.16b, { v16.16b }, v27.16b
+       eor     v0.16b, v26.16b, v0.16b
+       eor     v2.16b, v7.16b, v2.16b
+       add     v21.4s, v21.4s, v6.4s
+       add     v19.4s, v19.4s, v16.4s
+       ushr    v12.4s, v0.4s, #12
        shl     v0.4s, v0.4s, #20
-       eor     v10.16b, v5.16b, v10.16b
-       add     v6.4s, v6.4s, v12.4s
-       tbl     v3.16b, { v3.16b }, v16.16b
-       orr     v0.16b, v0.16b, v15.16b
-       ushr    v15.4s, v10.4s, #12
-       shl     v10.4s, v10.4s, #20
-       eor     v9.16b, v6.16b, v9.16b
-       add     v8.4s, v8.4s, v3.4s
-       add     v17.4s, v17.4s, v23.4s
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v9.4s, #12
-       shl     v9.4s, v9.4s, #20
-       eor     v2.16b, v8.16b, v2.16b
-       add     v17.4s, v17.4s, v0.4s
-       add     v4.4s, v4.4s, v22.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v2.4s, #12
+       ushr    v13.4s, v2.4s, #12
        shl     v2.4s, v2.4s, #20
-       eor     v14.16b, v14.16b, v17.16b
-       add     v4.4s, v4.4s, v10.4s
-       add     v7.4s, v7.4s, v27.4s
-       orr     v2.16b, v2.16b, v15.16b
-       tbl     v14.16b, { v14.16b }, v19.16b
-       eor     v11.16b, v11.16b, v4.16b
-       add     v7.4s, v7.4s, v9.4s
-       add     v13.4s, v13.4s, v30.4s
-       add     v1.4s, v1.4s, v14.4s
-       tbl     v11.16b, { v11.16b }, v19.16b
-       eor     v12.16b, v12.16b, v7.16b
-       add     v13.4s, v13.4s, v2.4s
-       ldur    q27, [x29, #-160]
-       eor     v0.16b, v0.16b, v1.16b
-       add     v5.4s, v5.4s, v11.4s
-       tbl     v12.16b, { v12.16b }, v19.16b
-       eor     v3.16b, v3.16b, v13.16b
-       ushr    v15.4s, v0.4s, #7
+       eor     v5.16b, v21.16b, v5.16b
+       eor     v1.16b, v19.16b, v1.16b
+       orr     v0.16b, v0.16b, v12.16b
+       add     v10.4s, v10.4s, v23.4s
+       ldr     q23, [sp, #240]
+       orr     v2.16b, v2.16b, v13.16b
+       ushr    v13.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       add     v22.4s, v22.4s, v11.4s
+       mov     v30.16b, v8.16b
+       mov     v8.16b, v24.16b
+       ldr     q24, [sp, #352]
+       ushr    v12.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
+       add     v10.4s, v10.4s, v0.4s
+       orr     v5.16b, v5.16b, v13.16b
+       str     q8, [sp, #112]
+       add     v22.4s, v22.4s, v2.4s
+       add     v20.4s, v20.4s, v24.4s
+       orr     v1.16b, v1.16b, v12.16b
+       add     v17.4s, v17.4s, v31.4s
+       eor     v4.16b, v4.16b, v10.16b
+       eor     v25.16b, v25.16b, v22.16b
+       add     v20.4s, v20.4s, v5.4s
+       add     v17.4s, v17.4s, v1.4s
+       tbl     v4.16b, { v4.16b }, v18.16b
+       tbl     v25.16b, { v25.16b }, v18.16b
+       eor     v6.16b, v6.16b, v20.16b
+       eor     v16.16b, v16.16b, v17.16b
+       add     v26.4s, v26.4s, v4.4s
+       add     v7.4s, v7.4s, v25.4s
+       tbl     v6.16b, { v6.16b }, v18.16b
+       tbl     v16.16b, { v16.16b }, v18.16b
+       eor     v0.16b, v26.16b, v0.16b
+       eor     v2.16b, v7.16b, v2.16b
+       add     v21.4s, v21.4s, v6.4s
+       mov     v29.16b, v3.16b
+       add     v19.4s, v19.4s, v16.4s
+       ushr    v12.4s, v0.4s, #7
        shl     v0.4s, v0.4s, #25
-       eor     v10.16b, v5.16b, v10.16b
-       add     v6.4s, v6.4s, v12.4s
-       tbl     v3.16b, { v3.16b }, v19.16b
-       orr     v0.16b, v0.16b, v15.16b
-       ushr    v15.4s, v10.4s, #7
-       shl     v10.4s, v10.4s, #25
-       eor     v9.16b, v6.16b, v9.16b
-       add     v8.4s, v8.4s, v3.4s
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v9.4s, #7
-       shl     v9.4s, v9.4s, #25
-       eor     v2.16b, v8.16b, v2.16b
-       add     v17.4s, v17.4s, v27.4s
-       mov     v28.16b, v25.16b
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v2.4s, #7
+       ushr    v13.4s, v2.4s, #7
        shl     v2.4s, v2.4s, #25
-       add     v17.4s, v17.4s, v10.4s
-       add     v4.4s, v4.4s, v21.4s
-       orr     v2.16b, v2.16b, v15.16b
-       eor     v3.16b, v3.16b, v17.16b
-       add     v4.4s, v4.4s, v9.4s
-       add     v7.4s, v7.4s, v28.4s
-       tbl     v3.16b, { v3.16b }, v16.16b
-       eor     v14.16b, v14.16b, v4.16b
-       add     v7.4s, v7.4s, v2.4s
-       add     v13.4s, v13.4s, v29.4s
-       mov     v25.16b, v31.16b
-       add     v6.4s, v6.4s, v3.4s
-       tbl     v14.16b, { v14.16b }, v16.16b
-       eor     v11.16b, v11.16b, v7.16b
-       add     v13.4s, v13.4s, v0.4s
-       ldur    q31, [x29, #-96]
-       eor     v10.16b, v6.16b, v10.16b
-       add     v8.4s, v8.4s, v14.4s
-       tbl     v11.16b, { v11.16b }, v16.16b
-       eor     v12.16b, v12.16b, v13.16b
-       ldur    q28, [x29, #-208]
-       mov     v18.16b, v20.16b
-       str     q20, [sp, #144]
-       ushr    v15.4s, v10.4s, #12
-       shl     v10.4s, v10.4s, #20
-       eor     v9.16b, v8.16b, v9.16b
-       add     v1.4s, v1.4s, v11.4s
-       tbl     v12.16b, { v12.16b }, v16.16b
-       ldur    q20, [x29, #-128]
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v9.4s, #12
-       shl     v9.4s, v9.4s, #20
-       eor     v2.16b, v1.16b, v2.16b
-       add     v5.4s, v5.4s, v12.4s
-       add     v17.4s, v17.4s, v24.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v2.4s, #12
-       shl     v2.4s, v2.4s, #20
-       eor     v0.16b, v5.16b, v0.16b
-       add     v17.4s, v17.4s, v10.4s
-       add     v4.4s, v4.4s, v31.4s
-       orr     v2.16b, v2.16b, v15.16b
-       ushr    v15.4s, v0.4s, #12
+       eor     v5.16b, v21.16b, v5.16b
+       eor     v1.16b, v19.16b, v1.16b
+       orr     v0.16b, v0.16b, v12.16b
+       add     v22.4s, v22.4s, v29.4s
+       orr     v2.16b, v2.16b, v13.16b
+       ushr    v13.4s, v5.4s, #7
+       shl     v5.4s, v5.4s, #25
+       add     v17.4s, v17.4s, v30.4s
+       ldr     q30, [sp, #272]
+       ushr    v12.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       add     v22.4s, v22.4s, v0.4s
+       mov     v3.16b, v28.16b
+       ldr     q28, [sp, #176]
+       orr     v5.16b, v5.16b, v13.16b
+       add     v17.4s, v17.4s, v2.4s
+       add     v10.4s, v10.4s, v30.4s
+       orr     v1.16b, v1.16b, v12.16b
+       eor     v16.16b, v16.16b, v22.16b
+       add     v20.4s, v20.4s, v28.4s
+       eor     v6.16b, v6.16b, v17.16b
+       add     v10.4s, v10.4s, v5.4s
+       tbl     v16.16b, { v16.16b }, v27.16b
+       add     v20.4s, v20.4s, v1.4s
+       tbl     v6.16b, { v6.16b }, v27.16b
+       eor     v25.16b, v25.16b, v10.16b
+       add     v21.4s, v21.4s, v16.4s
+       eor     v4.16b, v4.16b, v20.16b
+       add     v26.4s, v26.4s, v6.4s
+       tbl     v25.16b, { v25.16b }, v27.16b
+       eor     v0.16b, v21.16b, v0.16b
+       tbl     v4.16b, { v4.16b }, v27.16b
+       eor     v2.16b, v26.16b, v2.16b
+       add     v19.4s, v19.4s, v25.4s
+       ushr    v12.4s, v0.4s, #12
        shl     v0.4s, v0.4s, #20
-       eor     v3.16b, v3.16b, v17.16b
-       add     v4.4s, v4.4s, v9.4s
-       add     v7.4s, v7.4s, v28.4s
-       orr     v0.16b, v0.16b, v15.16b
-       tbl     v3.16b, { v3.16b }, v19.16b
-       eor     v14.16b, v14.16b, v4.16b
-       add     v7.4s, v7.4s, v2.4s
-       add     v13.4s, v13.4s, v20.4s
-       add     v6.4s, v6.4s, v3.4s
-       tbl     v14.16b, { v14.16b }, v19.16b
-       eor     v11.16b, v11.16b, v7.16b
-       add     v13.4s, v13.4s, v0.4s
-       eor     v10.16b, v6.16b, v10.16b
-       add     v8.4s, v8.4s, v14.4s
-       tbl     v11.16b, { v11.16b }, v19.16b
-       eor     v12.16b, v12.16b, v13.16b
-       ushr    v15.4s, v10.4s, #7
-       shl     v10.4s, v10.4s, #25
-       eor     v9.16b, v8.16b, v9.16b
-       add     v1.4s, v1.4s, v11.4s
-       tbl     v12.16b, { v12.16b }, v19.16b
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v9.4s, #7
-       shl     v9.4s, v9.4s, #25
-       eor     v2.16b, v1.16b, v2.16b
-       add     v5.4s, v5.4s, v12.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v2.4s, #7
-       shl     v2.4s, v2.4s, #25
-       eor     v0.16b, v5.16b, v0.16b
-       orr     v2.16b, v2.16b, v15.16b
-       ushr    v15.4s, v0.4s, #7
+       add     v7.4s, v7.4s, v4.4s
+       ushr    v13.4s, v2.4s, #12
+       shl     v2.4s, v2.4s, #20
+       eor     v5.16b, v5.16b, v19.16b
+       orr     v0.16b, v0.16b, v12.16b
+       eor     v1.16b, v7.16b, v1.16b
+       add     v22.4s, v22.4s, v8.4s
+       orr     v2.16b, v2.16b, v13.16b
+       ushr    v12.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       add     v17.4s, v17.4s, v9.4s
+       ldr     q9, [sp, #320]
+       ushr    v13.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
+       add     v22.4s, v22.4s, v0.4s
+       orr     v5.16b, v5.16b, v12.16b
+       add     v17.4s, v17.4s, v2.4s
+       add     v10.4s, v10.4s, v23.4s
+       orr     v1.16b, v1.16b, v13.16b
+       eor     v16.16b, v16.16b, v22.16b
+       add     v20.4s, v20.4s, v31.4s
+       eor     v6.16b, v6.16b, v17.16b
+       add     v10.4s, v10.4s, v5.4s
+       tbl     v16.16b, { v16.16b }, v18.16b
+       add     v20.4s, v20.4s, v1.4s
+       tbl     v6.16b, { v6.16b }, v18.16b
+       eor     v25.16b, v25.16b, v10.16b
+       add     v21.4s, v21.4s, v16.4s
+       eor     v4.16b, v4.16b, v20.16b
+       add     v26.4s, v26.4s, v6.4s
+       tbl     v25.16b, { v25.16b }, v18.16b
+       eor     v0.16b, v21.16b, v0.16b
+       tbl     v4.16b, { v4.16b }, v18.16b
+       eor     v2.16b, v26.16b, v2.16b
+       add     v19.4s, v19.4s, v25.4s
+       ushr    v12.4s, v0.4s, #7
        shl     v0.4s, v0.4s, #25
-       orr     v0.16b, v0.16b, v15.16b
-       add     v17.4s, v17.4s, v18.4s
-       add     v17.4s, v17.4s, v0.4s
-       add     v4.4s, v4.4s, v22.4s
-       eor     v14.16b, v14.16b, v17.16b
-       add     v4.4s, v4.4s, v10.4s
-       add     v7.4s, v7.4s, v30.4s
-       tbl     v14.16b, { v14.16b }, v16.16b
-       eor     v11.16b, v11.16b, v4.16b
-       add     v7.4s, v7.4s, v9.4s
-       add     v13.4s, v13.4s, v25.4s
-       add     v1.4s, v1.4s, v14.4s
-       tbl     v11.16b, { v11.16b }, v16.16b
-       eor     v12.16b, v12.16b, v7.16b
-       add     v13.4s, v13.4s, v2.4s
-       eor     v0.16b, v0.16b, v1.16b
-       add     v5.4s, v5.4s, v11.4s
-       tbl     v12.16b, { v12.16b }, v16.16b
-       eor     v3.16b, v3.16b, v13.16b
-       add     v17.4s, v17.4s, v26.4s
-       mov     v26.16b, v21.16b
-       add     v4.4s, v4.4s, v21.4s
-       ldur    q21, [x29, #-144]
-       ushr    v15.4s, v0.4s, #12
+       add     v7.4s, v7.4s, v4.4s
+       ushr    v13.4s, v2.4s, #7
+       shl     v2.4s, v2.4s, #25
+       eor     v5.16b, v19.16b, v5.16b
+       add     v10.4s, v10.4s, v14.4s
+       ldr     q14, [sp, #288]
+       orr     v0.16b, v0.16b, v12.16b
+       eor     v1.16b, v7.16b, v1.16b
+       orr     v2.16b, v2.16b, v13.16b
+       ushr    v12.4s, v5.4s, #7
+       shl     v5.4s, v5.4s, #25
+       add     v22.4s, v22.4s, v14.4s
+       ushr    v13.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       add     v10.4s, v10.4s, v0.4s
+       orr     v5.16b, v5.16b, v12.16b
+       add     v22.4s, v22.4s, v2.4s
+       add     v20.4s, v20.4s, v24.4s
+       orr     v1.16b, v1.16b, v13.16b
+       eor     v4.16b, v4.16b, v10.16b
+       add     v17.4s, v17.4s, v9.4s
+       eor     v25.16b, v25.16b, v22.16b
+       add     v20.4s, v20.4s, v5.4s
+       tbl     v4.16b, { v4.16b }, v27.16b
+       add     v17.4s, v17.4s, v1.4s
+       tbl     v25.16b, { v25.16b }, v27.16b
+       eor     v6.16b, v6.16b, v20.16b
+       add     v26.4s, v26.4s, v4.4s
+       eor     v16.16b, v16.16b, v17.16b
+       add     v7.4s, v7.4s, v25.4s
+       tbl     v6.16b, { v6.16b }, v27.16b
+       eor     v0.16b, v26.16b, v0.16b
+       tbl     v16.16b, { v16.16b }, v27.16b
+       eor     v2.16b, v7.16b, v2.16b
+       add     v21.4s, v21.4s, v6.4s
+       ushr    v12.4s, v0.4s, #12
        shl     v0.4s, v0.4s, #20
-       eor     v10.16b, v5.16b, v10.16b
-       add     v6.4s, v6.4s, v12.4s
-       tbl     v3.16b, { v3.16b }, v16.16b
-       orr     v0.16b, v0.16b, v15.16b
-       ushr    v15.4s, v10.4s, #12
-       shl     v10.4s, v10.4s, #20
-       eor     v9.16b, v6.16b, v9.16b
-       add     v8.4s, v8.4s, v3.4s
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v9.4s, #12
-       shl     v9.4s, v9.4s, #20
-       eor     v2.16b, v8.16b, v2.16b
-       add     v17.4s, v17.4s, v0.4s
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v2.4s, #12
+       add     v19.4s, v19.4s, v16.4s
+       ushr    v13.4s, v2.4s, #12
        shl     v2.4s, v2.4s, #20
-       eor     v14.16b, v14.16b, v17.16b
-       add     v4.4s, v4.4s, v10.4s
-       add     v7.4s, v7.4s, v21.4s
-       orr     v2.16b, v2.16b, v15.16b
-       tbl     v14.16b, { v14.16b }, v19.16b
-       eor     v11.16b, v11.16b, v4.16b
-       add     v7.4s, v7.4s, v9.4s
-       add     v13.4s, v13.4s, v28.4s
-       add     v1.4s, v1.4s, v14.4s
-       tbl     v11.16b, { v11.16b }, v19.16b
-       eor     v12.16b, v12.16b, v7.16b
-       add     v13.4s, v13.4s, v2.4s
-       str     q23, [sp, #160]
-       eor     v0.16b, v0.16b, v1.16b
-       add     v5.4s, v5.4s, v11.4s
-       tbl     v12.16b, { v12.16b }, v19.16b
-       eor     v3.16b, v3.16b, v13.16b
-       add     v17.4s, v17.4s, v23.4s
-       ldur    q23, [x29, #-64]
-       ushr    v15.4s, v0.4s, #7
+       eor     v5.16b, v21.16b, v5.16b
+       orr     v0.16b, v0.16b, v12.16b
+       eor     v1.16b, v19.16b, v1.16b
+       add     v10.4s, v10.4s, v11.4s
+       orr     v2.16b, v2.16b, v13.16b
+       ushr    v13.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       ushr    v12.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
+       add     v10.4s, v10.4s, v0.4s
+       add     v22.4s, v22.4s, v15.4s
+       orr     v5.16b, v5.16b, v13.16b
+       add     v20.4s, v20.4s, v3.4s
+       mov     v24.16b, v3.16b
+       ldr     q3, [sp, #336]
+       orr     v1.16b, v1.16b, v12.16b
+       eor     v4.16b, v4.16b, v10.16b
+       add     v22.4s, v22.4s, v2.4s
+       add     v17.4s, v17.4s, v3.4s
+       add     v20.4s, v20.4s, v5.4s
+       tbl     v4.16b, { v4.16b }, v18.16b
+       eor     v25.16b, v25.16b, v22.16b
+       add     v17.4s, v17.4s, v1.4s
+       eor     v6.16b, v6.16b, v20.16b
+       add     v26.4s, v26.4s, v4.4s
+       tbl     v25.16b, { v25.16b }, v18.16b
+       eor     v16.16b, v16.16b, v17.16b
+       tbl     v6.16b, { v6.16b }, v18.16b
+       eor     v0.16b, v26.16b, v0.16b
+       add     v7.4s, v7.4s, v25.4s
+       tbl     v16.16b, { v16.16b }, v18.16b
+       add     v21.4s, v21.4s, v6.4s
+       ushr    v12.4s, v0.4s, #7
        shl     v0.4s, v0.4s, #25
-       eor     v10.16b, v5.16b, v10.16b
-       add     v6.4s, v6.4s, v12.4s
-       tbl     v3.16b, { v3.16b }, v19.16b
-       orr     v0.16b, v0.16b, v15.16b
-       ushr    v15.4s, v10.4s, #7
-       shl     v10.4s, v10.4s, #25
-       eor     v9.16b, v6.16b, v9.16b
-       add     v8.4s, v8.4s, v3.4s
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v9.4s, #7
-       shl     v9.4s, v9.4s, #25
-       eor     v2.16b, v8.16b, v2.16b
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v2.4s, #7
+       eor     v2.16b, v7.16b, v2.16b
+       add     v19.4s, v19.4s, v16.4s
+       eor     v5.16b, v21.16b, v5.16b
+       orr     v0.16b, v0.16b, v12.16b
+       ushr    v12.4s, v2.4s, #7
        shl     v2.4s, v2.4s, #25
-       add     v17.4s, v17.4s, v10.4s
-       add     v4.4s, v4.4s, v23.4s
-       orr     v2.16b, v2.16b, v15.16b
-       eor     v3.16b, v3.16b, v17.16b
-       add     v4.4s, v4.4s, v9.4s
-       add     v7.4s, v7.4s, v24.4s
-       tbl     v3.16b, { v3.16b }, v16.16b
-       eor     v14.16b, v14.16b, v4.16b
-       add     v7.4s, v7.4s, v2.4s
-       add     v6.4s, v6.4s, v3.4s
-       tbl     v14.16b, { v14.16b }, v16.16b
-       eor     v11.16b, v11.16b, v7.16b
-       add     v13.4s, v13.4s, v20.4s
-       eor     v10.16b, v6.16b, v10.16b
-       add     v8.4s, v8.4s, v14.4s
-       tbl     v11.16b, { v11.16b }, v16.16b
-       add     v13.4s, v13.4s, v0.4s
-       ldr     q20, [sp, #176]
-       ushr    v15.4s, v10.4s, #12
-       shl     v10.4s, v10.4s, #20
-       eor     v9.16b, v8.16b, v9.16b
-       add     v1.4s, v1.4s, v11.4s
-       eor     v12.16b, v12.16b, v13.16b
-       orr     v10.16b, v10.16b, v15.16b
-       ushr    v15.4s, v9.4s, #12
-       shl     v9.4s, v9.4s, #20
-       eor     v2.16b, v1.16b, v2.16b
-       tbl     v12.16b, { v12.16b }, v16.16b
-       orr     v9.16b, v9.16b, v15.16b
-       ushr    v15.4s, v2.4s, #12
-       shl     v2.4s, v2.4s, #20
-       add     v5.4s, v5.4s, v12.4s
+       eor     v1.16b, v19.16b, v1.16b
+       ushr    v13.4s, v5.4s, #7
+       shl     v5.4s, v5.4s, #25
+       add     v22.4s, v22.4s, v8.4s
+       orr     v2.16b, v2.16b, v12.16b
+       ushr    v12.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       orr     v5.16b, v5.16b, v13.16b
+       add     v22.4s, v22.4s, v0.4s
+       add     v10.4s, v10.4s, v29.4s
+       ldr     q29, [sp, #208]
        add     v17.4s, v17.4s, v31.4s
-       orr     v2.16b, v2.16b, v15.16b
-       eor     v0.16b, v5.16b, v0.16b
-       add     v17.4s, v17.4s, v10.4s
-       add     v4.4s, v4.4s, v20.4s
-       add     v7.4s, v7.4s, v29.4s
-       ushr    v15.4s, v0.4s, #12
+       orr     v1.16b, v1.16b, v12.16b
+       add     v20.4s, v20.4s, v29.4s
+       eor     v16.16b, v16.16b, v22.16b
+       add     v10.4s, v10.4s, v5.4s
+       add     v17.4s, v17.4s, v2.4s
+       add     v20.4s, v20.4s, v1.4s
+       tbl     v16.16b, { v16.16b }, v27.16b
+       eor     v25.16b, v25.16b, v10.16b
+       eor     v6.16b, v6.16b, v17.16b
+       eor     v4.16b, v4.16b, v20.16b
+       add     v21.4s, v21.4s, v16.4s
+       tbl     v25.16b, { v25.16b }, v27.16b
+       tbl     v6.16b, { v6.16b }, v27.16b
+       tbl     v4.16b, { v4.16b }, v27.16b
+       eor     v0.16b, v21.16b, v0.16b
+       add     v19.4s, v19.4s, v25.4s
+       add     v26.4s, v26.4s, v6.4s
+       add     v7.4s, v7.4s, v4.4s
+       ushr    v12.4s, v0.4s, #12
        shl     v0.4s, v0.4s, #20
-       eor     v3.16b, v3.16b, v17.16b
-       add     v4.4s, v4.4s, v9.4s
-       add     v7.4s, v7.4s, v2.4s
-       orr     v0.16b, v0.16b, v15.16b
-       mov     v15.16b, v31.16b
-       add     v17.4s, v17.4s, v22.4s
-       eor     v31.16b, v14.16b, v4.16b
-       eor     v22.16b, v11.16b, v7.16b
-       add     v11.4s, v13.4s, v27.4s
-       tbl     v3.16b, { v3.16b }, v19.16b
-       add     v11.4s, v11.4s, v0.4s
-       tbl     v31.16b, { v31.16b }, v19.16b
-       add     v6.4s, v6.4s, v3.4s
-       eor     v12.16b, v12.16b, v11.16b
-       tbl     v22.16b, { v22.16b }, v19.16b
-       add     v8.4s, v8.4s, v31.4s
-       eor     v10.16b, v6.16b, v10.16b
-       add     v30.4s, v11.4s, v30.4s
-       tbl     v11.16b, { v12.16b }, v19.16b
-       add     v1.4s, v1.4s, v22.4s
-       eor     v9.16b, v8.16b, v9.16b
-       ushr    v12.4s, v10.4s, #7
-       shl     v10.4s, v10.4s, #25
-       add     v5.4s, v5.4s, v11.4s
-       eor     v2.16b, v1.16b, v2.16b
-       orr     v10.16b, v10.16b, v12.16b
-       ushr    v12.4s, v9.4s, #7
-       shl     v9.4s, v9.4s, #25
-       eor     v0.16b, v5.16b, v0.16b
-       orr     v9.16b, v9.16b, v12.16b
-       ushr    v12.4s, v2.4s, #7
+       eor     v5.16b, v5.16b, v19.16b
+       eor     v2.16b, v26.16b, v2.16b
+       eor     v1.16b, v7.16b, v1.16b
+       orr     v0.16b, v0.16b, v12.16b
+       ushr    v12.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       add     v22.4s, v22.4s, v14.4s
+       mov     v8.16b, v31.16b
+       ushr    v13.4s, v2.4s, #12
+       shl     v2.4s, v2.4s, #20
+       mov     v31.16b, v14.16b
+       ushr    v14.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
+       orr     v5.16b, v5.16b, v12.16b
+       add     v22.4s, v22.4s, v0.4s
+       add     v10.4s, v10.4s, v28.4s
+       ldr     q28, [sp, #352]
+       orr     v2.16b, v2.16b, v13.16b
+       orr     v1.16b, v1.16b, v14.16b
+       add     v17.4s, v17.4s, v30.4s
+       add     v20.4s, v20.4s, v3.4s
+       eor     v16.16b, v16.16b, v22.16b
+       add     v10.4s, v10.4s, v5.4s
+       add     v17.4s, v17.4s, v2.4s
+       add     v20.4s, v20.4s, v1.4s
+       tbl     v16.16b, { v16.16b }, v18.16b
+       eor     v25.16b, v25.16b, v10.16b
+       eor     v6.16b, v6.16b, v17.16b
+       eor     v4.16b, v4.16b, v20.16b
+       add     v21.4s, v21.4s, v16.4s
+       tbl     v25.16b, { v25.16b }, v18.16b
+       tbl     v6.16b, { v6.16b }, v18.16b
+       tbl     v4.16b, { v4.16b }, v18.16b
+       eor     v0.16b, v21.16b, v0.16b
+       add     v19.4s, v19.4s, v25.4s
+       add     v26.4s, v26.4s, v6.4s
+       add     v7.4s, v7.4s, v4.4s
+       ushr    v12.4s, v0.4s, #7
+       shl     v0.4s, v0.4s, #25
+       eor     v5.16b, v19.16b, v5.16b
+       eor     v2.16b, v26.16b, v2.16b
+       eor     v1.16b, v7.16b, v1.16b
+       orr     v0.16b, v0.16b, v12.16b
+       ushr    v12.4s, v5.4s, #7
+       shl     v5.4s, v5.4s, #25
+       add     v10.4s, v10.4s, v23.4s
+       ushr    v13.4s, v2.4s, #7
        shl     v2.4s, v2.4s, #25
+       ushr    v14.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       orr     v5.16b, v5.16b, v12.16b
+       add     v10.4s, v10.4s, v0.4s
+       add     v20.4s, v20.4s, v24.4s
+       ldr     q24, [sp, #144]
+       orr     v2.16b, v2.16b, v13.16b
+       orr     v1.16b, v1.16b, v14.16b
+       add     v22.4s, v22.4s, v9.4s
+       add     v17.4s, v17.4s, v11.4s
+       eor     v4.16b, v4.16b, v10.16b
+       add     v20.4s, v20.4s, v5.4s
+       add     v22.4s, v22.4s, v2.4s
+       add     v17.4s, v17.4s, v1.4s
+       tbl     v4.16b, { v4.16b }, v27.16b
+       eor     v6.16b, v6.16b, v20.16b
+       eor     v25.16b, v25.16b, v22.16b
+       eor     v16.16b, v16.16b, v17.16b
+       add     v26.4s, v26.4s, v4.4s
+       tbl     v6.16b, { v6.16b }, v27.16b
+       tbl     v25.16b, { v25.16b }, v27.16b
+       tbl     v16.16b, { v16.16b }, v27.16b
+       eor     v0.16b, v26.16b, v0.16b
+       add     v21.4s, v21.4s, v6.4s
+       add     v7.4s, v7.4s, v25.4s
+       add     v19.4s, v19.4s, v16.4s
+       ushr    v12.4s, v0.4s, #12
+       shl     v0.4s, v0.4s, #20
+       eor     v5.16b, v21.16b, v5.16b
+       eor     v2.16b, v7.16b, v2.16b
+       eor     v1.16b, v19.16b, v1.16b
+       orr     v0.16b, v0.16b, v12.16b
+       add     v10.4s, v10.4s, v15.4s
+       ushr    v14.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       mov     v30.16b, v3.16b
+       ldr     q3, [sp, #256]
+       ushr    v12.4s, v2.4s, #12
+       shl     v2.4s, v2.4s, #20
+       ushr    v13.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
+       add     v10.4s, v10.4s, v0.4s
+       orr     v5.16b, v5.16b, v14.16b
+       add     v20.4s, v20.4s, v3.4s
        orr     v2.16b, v2.16b, v12.16b
+       orr     v1.16b, v1.16b, v13.16b
+       add     v22.4s, v22.4s, v24.4s
+       add     v17.4s, v17.4s, v28.4s
+       eor     v4.16b, v4.16b, v10.16b
+       add     v20.4s, v20.4s, v5.4s
+       add     v22.4s, v22.4s, v2.4s
+       add     v17.4s, v17.4s, v1.4s
+       tbl     v4.16b, { v4.16b }, v18.16b
+       eor     v6.16b, v6.16b, v20.16b
+       eor     v25.16b, v25.16b, v22.16b
+       eor     v16.16b, v16.16b, v17.16b
+       add     v26.4s, v26.4s, v4.4s
+       tbl     v6.16b, { v6.16b }, v18.16b
+       tbl     v25.16b, { v25.16b }, v18.16b
+       tbl     v16.16b, { v16.16b }, v18.16b
+       eor     v0.16b, v26.16b, v0.16b
+       add     v21.4s, v21.4s, v6.4s
+       add     v7.4s, v7.4s, v25.4s
+       add     v19.4s, v19.4s, v16.4s
        ushr    v12.4s, v0.4s, #7
        shl     v0.4s, v0.4s, #25
+       eor     v5.16b, v21.16b, v5.16b
+       eor     v2.16b, v7.16b, v2.16b
+       eor     v1.16b, v19.16b, v1.16b
        orr     v0.16b, v0.16b, v12.16b
-       add     v4.4s, v4.4s, v26.4s
-       add     v17.4s, v17.4s, v0.4s
-       add     v7.4s, v7.4s, v28.4s
-       mov     v18.16b, v27.16b
-       eor     v31.16b, v31.16b, v17.16b
-       add     v4.4s, v4.4s, v10.4s
-       add     v27.4s, v30.4s, v2.4s
-       eor     v22.16b, v22.16b, v4.16b
-       add     v7.4s, v7.4s, v9.4s
-       eor     v3.16b, v3.16b, v27.16b
-       add     v26.4s, v27.4s, v29.4s
-       tbl     v27.16b, { v31.16b }, v16.16b
-       eor     v28.16b, v11.16b, v7.16b
-       tbl     v22.16b, { v22.16b }, v16.16b
-       add     v1.4s, v1.4s, v27.4s
-       add     v4.4s, v4.4s, v23.4s
-       ldr     q23, [sp, #144]
-       tbl     v28.16b, { v28.16b }, v16.16b
-       tbl     v3.16b, { v3.16b }, v16.16b
-       add     v5.4s, v5.4s, v22.4s
-       eor     v0.16b, v0.16b, v1.16b
-       add     v6.4s, v6.4s, v28.4s
-       add     v29.4s, v8.4s, v3.4s
-       eor     v30.16b, v5.16b, v10.16b
-       ushr    v8.4s, v0.4s, #12
+       ushr    v12.4s, v5.4s, #7
+       shl     v5.4s, v5.4s, #25
+       mov     v23.16b, v9.16b
+       ldr     q9, [sp, #112]
+       ushr    v13.4s, v2.4s, #7
+       shl     v2.4s, v2.4s, #25
+       ushr    v14.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       orr     v5.16b, v5.16b, v12.16b
+       add     v9.4s, v10.4s, v9.4s
+       orr     v2.16b, v2.16b, v13.16b
+       orr     v1.16b, v1.16b, v14.16b
+       ldr     q14, [sp, #64]
+       add     v22.4s, v22.4s, v31.4s
+       add     v17.4s, v17.4s, v30.4s
+       add     v20.4s, v20.4s, v8.4s
+       add     v9.4s, v9.4s, v5.4s
+       add     v22.4s, v22.4s, v0.4s
+       add     v17.4s, v17.4s, v2.4s
+       add     v20.4s, v20.4s, v1.4s
+       eor     v25.16b, v25.16b, v9.16b
+       eor     v16.16b, v16.16b, v22.16b
+       eor     v6.16b, v6.16b, v17.16b
+       eor     v4.16b, v4.16b, v20.16b
+       tbl     v25.16b, { v25.16b }, v27.16b
+       tbl     v16.16b, { v16.16b }, v27.16b
+       tbl     v6.16b, { v6.16b }, v27.16b
+       tbl     v4.16b, { v4.16b }, v27.16b
+       add     v19.4s, v19.4s, v25.4s
+       add     v21.4s, v21.4s, v16.4s
+       add     v26.4s, v26.4s, v6.4s
+       add     v7.4s, v7.4s, v4.4s
+       eor     v5.16b, v5.16b, v19.16b
+       eor     v0.16b, v21.16b, v0.16b
+       eor     v2.16b, v26.16b, v2.16b
+       eor     v1.16b, v7.16b, v1.16b
+       ushr    v30.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       ushr    v10.4s, v0.4s, #12
        shl     v0.4s, v0.4s, #20
-       eor     v31.16b, v6.16b, v9.16b
-       orr     v0.16b, v0.16b, v8.16b
-       ushr    v8.4s, v30.4s, #12
-       shl     v30.4s, v30.4s, #20
-       eor     v2.16b, v29.16b, v2.16b
-       orr     v30.16b, v30.16b, v8.16b
-       ushr    v8.4s, v31.4s, #12
-       shl     v31.4s, v31.4s, #20
-       add     v17.4s, v17.4s, v25.4s
-       add     v7.4s, v7.4s, v23.4s
-       orr     v31.16b, v31.16b, v8.16b
-       ushr    v8.4s, v2.4s, #12
+       ushr    v12.4s, v2.4s, #12
        shl     v2.4s, v2.4s, #20
-       ldur    q23, [x29, #-176]
-       orr     v2.16b, v2.16b, v8.16b
-       add     v17.4s, v17.4s, v0.4s
-       eor     v27.16b, v27.16b, v17.16b
-       add     v4.4s, v4.4s, v30.4s
-       add     v25.4s, v26.4s, v2.4s
-       eor     v22.16b, v22.16b, v4.16b
-       add     v4.4s, v4.4s, v24.4s
-       add     v7.4s, v7.4s, v31.4s
-       eor     v3.16b, v3.16b, v25.16b
-       add     v24.4s, v25.4s, v18.4s
-       tbl     v25.16b, { v27.16b }, v19.16b
+       ushr    v13.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
+       orr     v5.16b, v5.16b, v30.16b
+       add     v30.4s, v9.4s, v29.4s
+       add     v22.4s, v22.4s, v23.4s
+       ldr     q23, [sp, #192]
+       orr     v0.16b, v0.16b, v10.16b
+       orr     v2.16b, v2.16b, v12.16b
+       orr     v1.16b, v1.16b, v13.16b
        add     v17.4s, v17.4s, v23.4s
-       eor     v23.16b, v28.16b, v7.16b
-       tbl     v22.16b, { v22.16b }, v19.16b
-       add     v1.4s, v1.4s, v25.4s
-       tbl     v23.16b, { v23.16b }, v19.16b
-       tbl     v3.16b, { v3.16b }, v19.16b
-       add     v5.4s, v5.4s, v22.4s
-       eor     v0.16b, v0.16b, v1.16b
-       add     v6.4s, v6.4s, v23.4s
-       add     v26.4s, v29.4s, v3.4s
-       eor     v27.16b, v5.16b, v30.16b
-       ushr    v29.4s, v0.4s, #7
-       shl     v0.4s, v0.4s, #25
-       eor     v28.16b, v6.16b, v31.16b
-       orr     v0.16b, v0.16b, v29.16b
-       ushr    v29.4s, v27.4s, #7
-       shl     v27.4s, v27.4s, #25
+       add     v20.4s, v20.4s, v28.4s
+       add     v23.4s, v30.4s, v5.4s
+       add     v22.4s, v22.4s, v0.4s
+       add     v17.4s, v17.4s, v2.4s
+       add     v20.4s, v20.4s, v1.4s
+       eor     v25.16b, v25.16b, v23.16b
+       eor     v16.16b, v16.16b, v22.16b
+       eor     v6.16b, v6.16b, v17.16b
+       eor     v4.16b, v4.16b, v20.16b
+       tbl     v25.16b, { v25.16b }, v18.16b
+       tbl     v16.16b, { v16.16b }, v18.16b
+       tbl     v6.16b, { v6.16b }, v18.16b
+       tbl     v4.16b, { v4.16b }, v18.16b
+       add     v19.4s, v19.4s, v25.4s
+       add     v21.4s, v21.4s, v16.4s
+       add     v26.4s, v26.4s, v6.4s
+       add     v7.4s, v7.4s, v4.4s
+       eor     v5.16b, v19.16b, v5.16b
+       eor     v0.16b, v21.16b, v0.16b
        eor     v2.16b, v26.16b, v2.16b
-       orr     v27.16b, v27.16b, v29.16b
-       ushr    v29.4s, v28.4s, #7
-       shl     v28.4s, v28.4s, #25
-       ldur    q18, [x29, #-128]
-       orr     v28.16b, v28.16b, v29.16b
-       ushr    v29.4s, v2.4s, #7
+       eor     v1.16b, v7.16b, v1.16b
+       ushr    v28.4s, v5.4s, #7
+       shl     v5.4s, v5.4s, #25
+       ushr    v30.4s, v0.4s, #7
+       shl     v0.4s, v0.4s, #25
+       ushr    v31.4s, v2.4s, #7
        shl     v2.4s, v2.4s, #25
-       add     v7.4s, v7.4s, v15.4s
-       orr     v2.16b, v2.16b, v29.16b
-       add     v17.4s, v17.4s, v27.4s
-       add     v4.4s, v4.4s, v28.4s
-       add     v7.4s, v7.4s, v2.4s
-       eor     v3.16b, v3.16b, v17.16b
-       add     v17.4s, v17.4s, v20.4s
-       eor     v20.16b, v25.16b, v4.16b
-       add     v4.4s, v4.4s, v21.4s
-       eor     v21.16b, v22.16b, v7.16b
-       add     v7.4s, v7.4s, v18.4s
-       add     v18.4s, v24.4s, v0.4s
-       eor     v22.16b, v23.16b, v18.16b
-       ldr     q23, [sp, #160]
-       tbl     v3.16b, { v3.16b }, v16.16b
-       tbl     v20.16b, { v20.16b }, v16.16b
-       add     v6.4s, v6.4s, v3.4s
-       add     v18.4s, v18.4s, v23.4s
-       tbl     v21.16b, { v21.16b }, v16.16b
-       tbl     v16.16b, { v22.16b }, v16.16b
-       add     v22.4s, v26.4s, v20.4s
-       eor     v23.16b, v6.16b, v27.16b
-       add     v1.4s, v1.4s, v21.4s
-       eor     v24.16b, v22.16b, v28.16b
-       ushr    v25.4s, v23.4s, #12
-       shl     v23.4s, v23.4s, #20
-       add     v5.4s, v5.4s, v16.4s
-       eor     v2.16b, v1.16b, v2.16b
-       orr     v23.16b, v23.16b, v25.16b
-       ushr    v25.4s, v24.4s, #12
-       shl     v24.4s, v24.4s, #20
-       eor     v0.16b, v5.16b, v0.16b
-       orr     v24.16b, v24.16b, v25.16b
-       ushr    v25.4s, v2.4s, #12
+       ushr    v8.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       orr     v5.16b, v5.16b, v28.16b
+       ldr     q28, [sp, #176]
+       orr     v0.16b, v0.16b, v30.16b
+       orr     v2.16b, v2.16b, v31.16b
+       orr     v1.16b, v1.16b, v8.16b
+       add     v23.4s, v23.4s, v28.4s
+       add     v22.4s, v22.4s, v11.4s
+       add     v17.4s, v17.4s, v15.4s
+       add     v20.4s, v20.4s, v3.4s
+       ldr     q3, [sp, #272]
+       add     v23.4s, v23.4s, v0.4s
+       add     v22.4s, v22.4s, v2.4s
+       add     v17.4s, v17.4s, v1.4s
+       add     v20.4s, v20.4s, v5.4s
+       eor     v4.16b, v4.16b, v23.16b
+       eor     v25.16b, v25.16b, v22.16b
+       eor     v16.16b, v16.16b, v17.16b
+       eor     v6.16b, v6.16b, v20.16b
+       tbl     v4.16b, { v4.16b }, v27.16b
+       tbl     v25.16b, { v25.16b }, v27.16b
+       tbl     v16.16b, { v16.16b }, v27.16b
+       tbl     v6.16b, { v6.16b }, v27.16b
+       add     v26.4s, v26.4s, v4.4s
+       add     v7.4s, v7.4s, v25.4s
+       add     v19.4s, v19.4s, v16.4s
+       add     v21.4s, v21.4s, v6.4s
+       eor     v0.16b, v26.16b, v0.16b
+       eor     v2.16b, v7.16b, v2.16b
+       eor     v1.16b, v19.16b, v1.16b
+       eor     v5.16b, v21.16b, v5.16b
+       add     v3.4s, v22.4s, v3.4s
+       ldr     q22, [sp, #160]
+       ushr    v28.4s, v0.4s, #12
+       shl     v0.4s, v0.4s, #20
+       ushr    v29.4s, v2.4s, #12
        shl     v2.4s, v2.4s, #20
+       ushr    v30.4s, v1.4s, #12
+       shl     v1.4s, v1.4s, #20
+       ushr    v31.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       add     v17.4s, v17.4s, v22.4s
+       ldr     q22, [sp, #240]
+       orr     v0.16b, v0.16b, v28.16b
+       prfm    pldl1keep, [x23, #256]
+       orr     v2.16b, v2.16b, v29.16b
+       prfm    pldl1keep, [x24, #256]
+       orr     v1.16b, v1.16b, v30.16b
+       prfm    pldl1keep, [x22, #256]
+       orr     v5.16b, v5.16b, v31.16b
+       prfm    pldl1keep, [x25, #256]
+       add     v23.4s, v23.4s, v24.4s
+       add     v20.4s, v20.4s, v22.4s
+       add     v3.4s, v3.4s, v2.4s
+       add     v17.4s, v17.4s, v1.4s
+       add     v22.4s, v23.4s, v0.4s
+       add     v20.4s, v20.4s, v5.4s
+       eor     v23.16b, v25.16b, v3.16b
+       eor     v16.16b, v16.16b, v17.16b
+       eor     v4.16b, v4.16b, v22.16b
+       eor     v6.16b, v6.16b, v20.16b
+       tbl     v23.16b, { v23.16b }, v18.16b
+       tbl     v16.16b, { v16.16b }, v18.16b
+       tbl     v4.16b, { v4.16b }, v18.16b
+       tbl     v6.16b, { v6.16b }, v18.16b
+       add     v7.4s, v7.4s, v23.4s
+       add     v19.4s, v19.4s, v16.4s
+       add     v18.4s, v26.4s, v4.4s
+       add     v21.4s, v21.4s, v6.4s
+       eor     v2.16b, v7.16b, v2.16b
+       eor     v1.16b, v19.16b, v1.16b
+       eor     v0.16b, v18.16b, v0.16b
+       eor     v5.16b, v21.16b, v5.16b
+       ushr    v25.4s, v2.4s, #7
+       shl     v2.4s, v2.4s, #25
+       ushr    v24.4s, v0.4s, #7
+       shl     v0.4s, v0.4s, #25
+       ushr    v26.4s, v1.4s, #7
+       shl     v1.4s, v1.4s, #25
+       ushr    v27.4s, v5.4s, #7
+       shl     v5.4s, v5.4s, #25
+       orr     v0.16b, v0.16b, v24.16b
        orr     v2.16b, v2.16b, v25.16b
-       ushr    v25.4s, v0.4s, #12
-       shl     v0.4s, v0.4s, #20
-       orr     v0.16b, v0.16b, v25.16b
-       add     v25.4s, v7.4s, v2.4s
-       add     v26.4s, v18.4s, v0.4s
-       eor     v18.16b, v21.16b, v25.16b
-       add     v17.4s, v17.4s, v23.4s
-       add     v4.4s, v4.4s, v24.4s
-       eor     v16.16b, v16.16b, v26.16b
-       tbl     v21.16b, { v18.16b }, v19.16b
-       eor     v3.16b, v3.16b, v17.16b
-       eor     v7.16b, v20.16b, v4.16b
-       tbl     v16.16b, { v16.16b }, v19.16b
-       add     v1.4s, v1.4s, v21.4s
-       tbl     v3.16b, { v3.16b }, v19.16b
-       tbl     v20.16b, { v7.16b }, v19.16b
-       eor     v2.16b, v1.16b, v2.16b
-       eor     v7.16b, v1.16b, v17.16b
-       add     v1.4s, v5.4s, v16.4s
-       eor     v0.16b, v1.16b, v0.16b
-       eor     v18.16b, v1.16b, v4.16b
-       add     v1.4s, v6.4s, v3.4s
-       eor     v4.16b, v1.16b, v23.16b
-       eor     v6.16b, v25.16b, v1.16b
-       add     v1.4s, v22.4s, v20.4s
-       eor     v5.16b, v1.16b, v24.16b
-       eor     v17.16b, v26.16b, v1.16b
-       ushr    v1.4s, v4.4s, #7
+       orr     v1.16b, v1.16b, v26.16b
+       orr     v5.16b, v5.16b, v27.16b
+       movi    v13.4s, #64
+       eor     v29.16b, v19.16b, v22.16b
+       eor     v8.16b, v21.16b, v3.16b
+       eor     v30.16b, v17.16b, v18.16b
+       eor     v31.16b, v20.16b, v7.16b
+       eor     v24.16b, v5.16b, v23.16b
+       eor     v18.16b, v0.16b, v16.16b
+       eor     v25.16b, v2.16b, v6.16b
+       eor     v26.16b, v1.16b, v4.16b
+       cbnz    x21, .LBB3_5
+       b       .LBB3_2
+.LBB3_6:
+       cbz     x1, .LBB3_14
+       adrp    x12, .LCPI3_3
+       ldr     q0, [x11, :lo12:.LCPI3_1]
+       orr     w11, w7, w6
+       ldr     q2, [x10, :lo12:.LCPI3_2]
+       ldr     q1, [x12, :lo12:.LCPI3_3]
+       and     x12, x5, #0x1
+.LBB3_8:
+       movi    v3.4s, #64
+       lsr     x13, x4, #32
+       ldp     q5, q4, [x3]
+       mov     x15, x2
+       mov     w14, w11
+       mov     v3.s[0], w4
+       ldr     x10, [x0]
+       mov     v3.s[1], w13
+       b       .LBB3_11
+.LBB3_9:
+       orr     w14, w14, w9
+.LBB3_10:
+       ldp     q6, q7, [x10]
+       mov     v16.16b, v3.16b
+       and     w14, w14, #0xff
+       add     v5.4s, v5.4s, v4.4s
+       mov     x15, x13
+       mov     v16.s[3], w14
+       add     x14, x10, #32
+       uzp1    v17.4s, v6.4s, v7.4s
+       add     x10, x10, #64
+       add     v5.4s, v5.4s, v17.4s
+       eor     v16.16b, v5.16b, v16.16b
+       tbl     v16.16b, { v16.16b }, v0.16b
+       add     v18.4s, v16.4s, v1.4s
+       eor     v19.16b, v18.16b, v4.16b
+       uzp2    v4.4s, v6.4s, v7.4s
+       ushr    v6.4s, v19.4s, #12
+       shl     v7.4s, v19.4s, #20
+       ld2     { v19.4s, v20.4s }, [x14]
+       add     v5.4s, v5.4s, v4.4s
+       mov     w14, w6
+       orr     v6.16b, v7.16b, v6.16b
+       add     v5.4s, v5.4s, v6.4s
+       eor     v7.16b, v16.16b, v5.16b
+       add     v5.4s, v5.4s, v19.4s
+       tbl     v7.16b, { v7.16b }, v2.16b
+       ext     v5.16b, v5.16b, v5.16b, #12
+       add     v16.4s, v18.4s, v7.4s
+       ext     v7.16b, v7.16b, v7.16b, #8
+       eor     v6.16b, v6.16b, v16.16b
+       ext     v16.16b, v16.16b, v16.16b, #4
+       ushr    v18.4s, v6.4s, #7
+       shl     v6.4s, v6.4s, #25
+       orr     v6.16b, v6.16b, v18.16b
+       ext     v18.16b, v20.16b, v20.16b, #12
+       add     v5.4s, v5.4s, v6.4s
+       eor     v7.16b, v5.16b, v7.16b
+       add     v5.4s, v5.4s, v18.4s
+       tbl     v7.16b, { v7.16b }, v0.16b
+       add     v16.4s, v16.4s, v7.4s
+       eor     v6.16b, v6.16b, v16.16b
+       ushr    v21.4s, v6.4s, #12
+       shl     v6.4s, v6.4s, #20
+       orr     v6.16b, v6.16b, v21.16b
+       uzp1    v21.4s, v17.4s, v17.4s
+       add     v5.4s, v5.4s, v6.4s
+       ext     v21.16b, v21.16b, v17.16b, #8
+       eor     v7.16b, v7.16b, v5.16b
+       uzp2    v21.4s, v21.4s, v4.4s
+       tbl     v7.16b, { v7.16b }, v2.16b
+       add     v5.4s, v5.4s, v21.4s
+       add     v16.4s, v16.4s, v7.4s
+       ext     v5.16b, v5.16b, v5.16b, #4
+       ext     v7.16b, v7.16b, v7.16b, #8
+       eor     v6.16b, v6.16b, v16.16b
+       ushr    v22.4s, v6.4s, #7
+       shl     v6.4s, v6.4s, #25
+       orr     v6.16b, v6.16b, v22.16b
+       add     v22.4s, v5.4s, v6.4s
+       eor     v5.16b, v22.16b, v7.16b
+       ext     v7.16b, v16.16b, v16.16b, #12
+       tbl     v16.16b, { v5.16b }, v0.16b
+       ext     v5.16b, v17.16b, v17.16b, #12
+       add     v7.4s, v7.4s, v16.4s
+       ext     v5.16b, v17.16b, v5.16b, #12
+       ext     v17.16b, v19.16b, v19.16b, #12
+       mov     v19.16b, v18.16b
+       eor     v6.16b, v6.16b, v7.16b
+       rev64   v5.4s, v5.4s
+       mov     v19.s[1], v17.s[2]
+       ushr    v20.4s, v6.4s, #12
+       shl     v6.4s, v6.4s, #20
+       trn2    v5.4s, v5.4s, v19.4s
+       orr     v6.16b, v6.16b, v20.16b
+       zip1    v20.2d, v18.2d, v4.2d
+       zip2    v4.4s, v4.4s, v18.4s
+       add     v19.4s, v6.4s, v5.4s
+       mov     v20.s[3], v17.s[3]
+       add     v19.4s, v19.4s, v22.4s
+       ext     v22.16b, v20.16b, v20.16b, #12
+       eor     v16.16b, v16.16b, v19.16b
+       ext     v19.16b, v19.16b, v19.16b, #12
+       tbl     v16.16b, { v16.16b }, v2.16b
+       add     v7.4s, v7.4s, v16.4s
+       ext     v16.16b, v16.16b, v16.16b, #8
+       eor     v6.16b, v6.16b, v7.16b
+       ext     v7.16b, v7.16b, v7.16b, #4
+       ushr    v23.4s, v6.4s, #7
+       shl     v24.4s, v6.4s, #25
+       uzp1    v6.4s, v20.4s, v22.4s
+       orr     v20.16b, v24.16b, v23.16b
+       add     v22.4s, v20.4s, v6.4s
+       add     v19.4s, v22.4s, v19.4s
+       eor     v16.16b, v19.16b, v16.16b
+       tbl     v16.16b, { v16.16b }, v0.16b
+       add     v7.4s, v7.4s, v16.4s
+       eor     v18.16b, v20.16b, v7.16b
+       zip1    v20.4s, v4.4s, v17.4s
+       zip1    v4.4s, v17.4s, v4.4s
+       ushr    v17.4s, v18.4s, #12
+       shl     v18.4s, v18.4s, #20
+       ext     v20.16b, v4.16b, v20.16b, #8
+       orr     v4.16b, v18.16b, v17.16b
+       ext     v18.16b, v21.16b, v21.16b, #4
+       add     v17.4s, v4.4s, v20.4s
+       add     v17.4s, v17.4s, v19.4s
+       uzp1    v19.4s, v18.4s, v18.4s
+       eor     v16.16b, v16.16b, v17.16b
+       ext     v19.16b, v19.16b, v18.16b, #8
+       tbl     v16.16b, { v16.16b }, v2.16b
+       uzp2    v19.4s, v19.4s, v5.4s
+       add     v7.4s, v7.4s, v16.4s
+       add     v17.4s, v17.4s, v19.4s
+       ext     v16.16b, v16.16b, v16.16b, #8
+       eor     v4.16b, v4.16b, v7.16b
+       ext     v17.16b, v17.16b, v17.16b, #4
+       ext     v7.16b, v7.16b, v7.16b, #12
+       ushr    v21.4s, v4.4s, #7
        shl     v4.4s, v4.4s, #25
-       orr     v1.16b, v4.16b, v1.16b
-       ushr    v4.4s, v5.4s, #7
+       orr     v4.16b, v4.16b, v21.16b
+       ext     v21.16b, v18.16b, v18.16b, #12
+       add     v17.4s, v17.4s, v4.4s
+       ext     v18.16b, v18.16b, v21.16b, #12
+       mov     v21.16b, v20.16b
+       eor     v16.16b, v17.16b, v16.16b
+       rev64   v18.4s, v18.4s
+       mov     v21.s[1], v6.s[2]
+       tbl     v16.16b, { v16.16b }, v0.16b
+       add     v7.4s, v7.4s, v16.4s
+       eor     v4.16b, v4.16b, v7.16b
+       ushr    v22.4s, v4.4s, #12
+       shl     v23.4s, v4.4s, #20
+       trn2    v4.4s, v18.4s, v21.4s
+       orr     v18.16b, v23.16b, v22.16b
+       add     v21.4s, v18.4s, v4.4s
+       add     v17.4s, v21.4s, v17.4s
+       zip1    v21.2d, v20.2d, v5.2d
+       zip2    v5.4s, v5.4s, v20.4s
+       eor     v16.16b, v16.16b, v17.16b
+       mov     v21.s[3], v6.s[3]
+       ext     v17.16b, v17.16b, v17.16b, #12
+       zip1    v20.4s, v5.4s, v6.4s
+       tbl     v16.16b, { v16.16b }, v2.16b
+       zip1    v5.4s, v6.4s, v5.4s
+       add     v22.4s, v7.4s, v16.4s
+       ext     v16.16b, v16.16b, v16.16b, #8
+       ext     v20.16b, v5.16b, v20.16b, #8
+       eor     v7.16b, v18.16b, v22.16b
+       ext     v18.16b, v21.16b, v21.16b, #12
+       ushr    v23.4s, v7.4s, #7
+       shl     v24.4s, v7.4s, #25
+       uzp1    v7.4s, v21.4s, v18.4s
+       orr     v18.16b, v24.16b, v23.16b
+       add     v21.4s, v18.4s, v7.4s
+       add     v17.4s, v21.4s, v17.4s
+       ext     v21.16b, v22.16b, v22.16b, #4
+       eor     v16.16b, v17.16b, v16.16b
+       tbl     v16.16b, { v16.16b }, v0.16b
+       add     v21.4s, v21.4s, v16.4s
+       eor     v18.16b, v18.16b, v21.16b
+       ushr    v6.4s, v18.4s, #12
+       shl     v18.4s, v18.4s, #20
+       orr     v5.16b, v18.16b, v6.16b
+       add     v6.4s, v5.4s, v20.4s
+       add     v6.4s, v6.4s, v17.4s
+       ext     v17.16b, v19.16b, v19.16b, #4
+       eor     v16.16b, v16.16b, v6.16b
+       uzp1    v18.4s, v17.4s, v17.4s
+       tbl     v16.16b, { v16.16b }, v2.16b
+       ext     v18.16b, v18.16b, v17.16b, #8
+       add     v19.4s, v21.4s, v16.4s
+       uzp2    v18.4s, v18.4s, v4.4s
+       ext     v16.16b, v16.16b, v16.16b, #8
+       eor     v5.16b, v5.16b, v19.16b
+       add     v6.4s, v6.4s, v18.4s
+       ext     v19.16b, v19.16b, v19.16b, #12
+       ushr    v21.4s, v5.4s, #7
        shl     v5.4s, v5.4s, #25
-       orr     v4.16b, v5.16b, v4.16b
-       ushr    v5.4s, v2.4s, #7
-       shl     v2.4s, v2.4s, #25
-       orr     v2.16b, v2.16b, v5.16b
-       ushr    v5.4s, v0.4s, #7
-       shl     v0.4s, v0.4s, #25
-       orr     v0.16b, v0.16b, v5.16b
-       eor     v10.16b, v0.16b, v20.16b
-       eor     v11.16b, v1.16b, v21.16b
-       eor     v19.16b, v4.16b, v16.16b
-       cmp     x0, x22
-       eor     v16.16b, v2.16b, v3.16b
-       mov     w6, w19
-       b.ne    .LBB2_4
-.LBB2_7:
-       zip1    v0.4s, v7.4s, v18.4s
-       zip2    v1.4s, v7.4s, v18.4s
-       zip1    v2.4s, v6.4s, v17.4s
-       zip2    v3.4s, v6.4s, v17.4s
-       zip1    v4.4s, v10.4s, v11.4s
-       zip2    v5.4s, v10.4s, v11.4s
-       zip1    v6.4s, v19.4s, v16.4s
-       zip2    v7.4s, v19.4s, v16.4s
-       add     x15, x20, #4
-       tst     w5, #0x1
-       sub     x28, x28, #4
-       zip1    v16.2d, v0.2d, v2.2d
-       zip2    v0.2d, v0.2d, v2.2d
-       zip1    v2.2d, v1.2d, v3.2d
-       zip2    v1.2d, v1.2d, v3.2d
-       zip1    v3.2d, v4.2d, v6.2d
-       zip2    v4.2d, v4.2d, v6.2d
-       zip1    v6.2d, v5.2d, v7.2d
-       zip2    v5.2d, v5.2d, v7.2d
-       add     x24, x24, #32
-       csel    x20, x15, x20, ne
-       cmp     x28, #3
-       stp     q16, q3, [x26]
-       stp     q0, q4, [x26, #32]
-       stp     q2, q6, [x26, #64]
-       stp     q1, q5, [x26, #96]
-       add     x26, x26, #128
-       b.hi    .LBB2_2
-.LBB2_8:
-       cbz     x28, .LBB2_16
-       orr     w8, w7, w19
-       and     x21, x5, #0x1
-       stur    w8, [x29, #-64]
-.LBB2_10:
-       ldr     x8, [sp, #40]
-       ldr     x25, [x24]
-       ldur    w4, [x29, #-64]
-       ldp     q1, q0, [x8]
-       mov     x8, x22
-       stp     q1, q0, [x29, #-48]
-.LBB2_11:
-       subs    x23, x8, #1
-       b.eq    .LBB2_13
-       cbnz    x8, .LBB2_14
-       b       .LBB2_15
-.LBB2_13:
-       orr     w4, w4, w27
-.LBB2_14:
-       sub     x0, x29, #48
-       mov     w2, #64
-       mov     x1, x25
-       mov     x3, x20
-       bl      zfs_blake3_compress_in_place_sse41
-       add     x25, x25, #64
-       mov     x8, x23
-       mov     w4, w19
-       b       .LBB2_11
-.LBB2_15:
-       ldp     q0, q1, [x29, #-48]
-       add     x20, x20, x21
-       add     x24, x24, #8
-       subs    x28, x28, #1
-       stp     q0, q1, [x26], #32
-       b.ne    .LBB2_10
-.LBB2_16:
-       add     sp, sp, #448
-       ldp     x20, x19, [sp, #144]
-       ldp     x22, x21, [sp, #128]
-       ldp     x24, x23, [sp, #112]
-       ldp     x26, x25, [sp, #96]
-       ldp     x28, x27, [sp, #80]
-       ldp     x29, x30, [sp, #64]
+       ext     v6.16b, v6.16b, v6.16b, #4
+       orr     v5.16b, v5.16b, v21.16b
+       ext     v21.16b, v17.16b, v17.16b, #12
+       add     v6.4s, v6.4s, v5.4s
+       ext     v17.16b, v17.16b, v21.16b, #12
+       mov     v21.16b, v20.16b
+       eor     v16.16b, v6.16b, v16.16b
+       rev64   v17.4s, v17.4s
+       mov     v21.s[1], v7.s[2]
+       tbl     v16.16b, { v16.16b }, v0.16b
+       add     v19.4s, v19.4s, v16.4s
+       eor     v5.16b, v5.16b, v19.16b
+       ushr    v22.4s, v5.4s, #12
+       shl     v23.4s, v5.4s, #20
+       trn2    v5.4s, v17.4s, v21.4s
+       orr     v17.16b, v23.16b, v22.16b
+       add     v21.4s, v17.4s, v5.4s
+       add     v6.4s, v21.4s, v6.4s
+       eor     v16.16b, v16.16b, v6.16b
+       ext     v6.16b, v6.16b, v6.16b, #12
+       tbl     v21.16b, { v16.16b }, v2.16b
+       zip1    v16.2d, v20.2d, v4.2d
+       zip2    v4.4s, v4.4s, v20.4s
+       add     v19.4s, v19.4s, v21.4s
+       mov     v16.s[3], v7.s[3]
+       ext     v21.16b, v21.16b, v21.16b, #8
+       zip1    v20.4s, v4.4s, v7.4s
+       eor     v17.16b, v17.16b, v19.16b
+       ext     v22.16b, v16.16b, v16.16b, #12
+       ext     v19.16b, v19.16b, v19.16b, #4
+       zip1    v4.4s, v7.4s, v4.4s
+       ushr    v23.4s, v17.4s, #7
+       shl     v17.4s, v17.4s, #25
+       uzp1    v16.4s, v16.4s, v22.4s
+       ext     v4.16b, v4.16b, v20.16b, #8
+       orr     v17.16b, v17.16b, v23.16b
+       add     v22.4s, v17.4s, v16.4s
+       add     v6.4s, v22.4s, v6.4s
+       eor     v21.16b, v6.16b, v21.16b
+       tbl     v21.16b, { v21.16b }, v0.16b
+       add     v19.4s, v19.4s, v21.4s
+       eor     v17.16b, v17.16b, v19.16b
+       ushr    v7.4s, v17.4s, #12
+       shl     v17.4s, v17.4s, #20
+       orr     v7.16b, v17.16b, v7.16b
+       add     v17.4s, v7.4s, v4.4s
+       add     v6.4s, v17.4s, v6.4s
+       ext     v17.16b, v18.16b, v18.16b, #4
+       eor     v18.16b, v21.16b, v6.16b
+       uzp1    v20.4s, v17.4s, v17.4s
+       tbl     v18.16b, { v18.16b }, v2.16b
+       ext     v20.16b, v20.16b, v17.16b, #8
+       add     v19.4s, v19.4s, v18.4s
+       uzp2    v20.4s, v20.4s, v5.4s
+       ext     v18.16b, v18.16b, v18.16b, #8
+       eor     v7.16b, v7.16b, v19.16b
+       add     v6.4s, v6.4s, v20.4s
+       ushr    v21.4s, v7.4s, #7
+       shl     v7.4s, v7.4s, #25
+       ext     v6.16b, v6.16b, v6.16b, #4
+       orr     v7.16b, v7.16b, v21.16b
+       add     v21.4s, v6.4s, v7.4s
+       eor     v6.16b, v21.16b, v18.16b
+       ext     v18.16b, v19.16b, v19.16b, #12
+       tbl     v19.16b, { v6.16b }, v0.16b
+       ext     v6.16b, v17.16b, v17.16b, #12
+       add     v18.4s, v18.4s, v19.4s
+       ext     v6.16b, v17.16b, v6.16b, #12
+       mov     v17.16b, v4.16b
+       eor     v7.16b, v7.16b, v18.16b
+       rev64   v6.4s, v6.4s
+       mov     v17.s[1], v16.s[2]
+       ushr    v22.4s, v7.4s, #12
+       shl     v7.4s, v7.4s, #20
+       trn2    v6.4s, v6.4s, v17.4s
+       orr     v7.16b, v7.16b, v22.16b
+       add     v17.4s, v7.4s, v6.4s
+       add     v17.4s, v17.4s, v21.4s
+       zip1    v21.2d, v4.2d, v5.2d
+       zip2    v4.4s, v5.4s, v4.4s
+       eor     v19.16b, v19.16b, v17.16b
+       mov     v21.s[3], v16.s[3]
+       ext     v17.16b, v17.16b, v17.16b, #12
+       tbl     v19.16b, { v19.16b }, v2.16b
+       ext     v22.16b, v21.16b, v21.16b, #12
+       add     v18.4s, v18.4s, v19.4s
+       ext     v19.16b, v19.16b, v19.16b, #8
+       eor     v7.16b, v7.16b, v18.16b
+       ext     v18.16b, v18.16b, v18.16b, #4
+       ushr    v23.4s, v7.4s, #7
+       shl     v24.4s, v7.4s, #25
+       uzp1    v7.4s, v21.4s, v22.4s
+       orr     v21.16b, v24.16b, v23.16b
+       add     v22.4s, v21.4s, v7.4s
+       add     v17.4s, v22.4s, v17.4s
+       eor     v19.16b, v17.16b, v19.16b
+       tbl     v19.16b, { v19.16b }, v0.16b
+       add     v18.4s, v18.4s, v19.4s
+       eor     v5.16b, v21.16b, v18.16b
+       zip1    v21.4s, v4.4s, v16.4s
+       zip1    v4.4s, v16.4s, v4.4s
+       ushr    v16.4s, v5.4s, #12
+       shl     v5.4s, v5.4s, #20
+       ext     v21.16b, v4.16b, v21.16b, #8
+       orr     v4.16b, v5.16b, v16.16b
+       ext     v16.16b, v20.16b, v20.16b, #4
+       mov     v23.16b, v21.16b
+       add     v5.4s, v4.4s, v21.4s
+       mov     v23.s[1], v7.s[2]
+       add     v5.4s, v5.4s, v17.4s
+       eor     v17.16b, v19.16b, v5.16b
+       uzp1    v19.4s, v16.4s, v16.4s
+       tbl     v17.16b, { v17.16b }, v2.16b
+       ext     v19.16b, v19.16b, v16.16b, #8
+       add     v18.4s, v18.4s, v17.4s
+       uzp2    v19.4s, v19.4s, v6.4s
+       eor     v4.16b, v4.16b, v18.16b
+       add     v5.4s, v5.4s, v19.4s
+       ext     v19.16b, v19.16b, v19.16b, #4
+       ushr    v20.4s, v4.4s, #7
+       shl     v4.4s, v4.4s, #25
+       ext     v5.16b, v5.16b, v5.16b, #4
+       orr     v20.16b, v4.16b, v20.16b
+       ext     v4.16b, v17.16b, v17.16b, #8
+       add     v17.4s, v5.4s, v20.4s
+       ext     v5.16b, v18.16b, v18.16b, #12
+       eor     v4.16b, v17.16b, v4.16b
+       tbl     v18.16b, { v4.16b }, v0.16b
+       ext     v4.16b, v16.16b, v16.16b, #12
+       add     v22.4s, v5.4s, v18.4s
+       ext     v4.16b, v16.16b, v4.16b, #12
+       eor     v5.16b, v20.16b, v22.16b
+       rev64   v16.4s, v4.4s
+       ushr    v20.4s, v5.4s, #12
+       shl     v24.4s, v5.4s, #20
+       trn2    v5.4s, v16.4s, v23.4s
+       orr     v16.16b, v24.16b, v20.16b
+       add     v20.4s, v16.4s, v5.4s
+       add     v17.4s, v20.4s, v17.4s
+       zip1    v20.2d, v21.2d, v6.2d
+       zip2    v6.4s, v6.4s, v21.4s
+       eor     v18.16b, v18.16b, v17.16b
+       mov     v20.s[3], v7.s[3]
+       ext     v17.16b, v17.16b, v17.16b, #12
+       zip1    v21.4s, v6.4s, v7.4s
+       tbl     v18.16b, { v18.16b }, v2.16b
+       ext     v24.16b, v20.16b, v20.16b, #12
+       zip1    v6.4s, v7.4s, v6.4s
+       add     v22.4s, v22.4s, v18.4s
+       ext     v18.16b, v18.16b, v18.16b, #8
+       ext     v6.16b, v6.16b, v21.16b, #8
+       eor     v16.16b, v16.16b, v22.16b
+       ext     v22.16b, v22.16b, v22.16b, #4
+       zip1    v5.2d, v6.2d, v5.2d
+       zip2    v4.4s, v4.4s, v6.4s
+       ushr    v25.4s, v16.4s, #7
+       shl     v26.4s, v16.4s, #25
+       uzp1    v16.4s, v20.4s, v24.4s
+       orr     v20.16b, v26.16b, v25.16b
+       mov     v5.s[3], v16.s[3]
+       add     v24.4s, v20.4s, v16.4s
+       add     v17.4s, v24.4s, v17.4s
+       eor     v18.16b, v17.16b, v18.16b
+       tbl     v18.16b, { v18.16b }, v0.16b
+       add     v22.4s, v22.4s, v18.4s
+       eor     v20.16b, v20.16b, v22.16b
+       ushr    v7.4s, v20.4s, #12
+       shl     v20.4s, v20.4s, #20
+       orr     v7.16b, v20.16b, v7.16b
+       add     v20.4s, v7.4s, v6.4s
+       add     v17.4s, v20.4s, v17.4s
+       ext     v20.16b, v19.16b, v19.16b, #8
+       eor     v18.16b, v18.16b, v17.16b
+       ext     v17.16b, v17.16b, v17.16b, #4
+       tbl     v18.16b, { v18.16b }, v2.16b
+       add     v21.4s, v22.4s, v18.4s
+       uzp2    v22.4s, v20.4s, v23.4s
+       ext     v18.16b, v18.16b, v18.16b, #8
+       eor     v7.16b, v7.16b, v21.16b
+       ext     v20.16b, v22.16b, v20.16b, #4
+       ushr    v22.4s, v7.4s, #7
+       shl     v7.4s, v7.4s, #25
+       add     v17.4s, v17.4s, v20.4s
+       ext     v20.16b, v21.16b, v21.16b, #12
+       ext     v21.16b, v19.16b, v19.16b, #12
+       orr     v7.16b, v7.16b, v22.16b
+       ext     v19.16b, v19.16b, v21.16b, #12
+       add     v17.4s, v17.4s, v7.4s
+       mov     v21.16b, v6.16b
+       rev64   v19.4s, v19.4s
+       eor     v18.16b, v17.16b, v18.16b
+       mov     v21.s[1], v16.s[2]
+       tbl     v18.16b, { v18.16b }, v0.16b
+       trn2    v19.4s, v19.4s, v21.4s
+       add     v20.4s, v20.4s, v18.4s
+       eor     v7.16b, v7.16b, v20.16b
+       ushr    v22.4s, v7.4s, #12
+       shl     v7.4s, v7.4s, #20
+       orr     v7.16b, v7.16b, v22.16b
+       add     v19.4s, v7.4s, v19.4s
+       add     v17.4s, v19.4s, v17.4s
+       eor     v18.16b, v18.16b, v17.16b
+       ext     v17.16b, v17.16b, v17.16b, #12
+       tbl     v18.16b, { v18.16b }, v2.16b
+       add     v19.4s, v20.4s, v18.4s
+       ext     v20.16b, v5.16b, v5.16b, #12
+       ext     v18.16b, v18.16b, v18.16b, #8
+       eor     v7.16b, v7.16b, v19.16b
+       uzp1    v5.4s, v5.4s, v20.4s
+       ushr    v21.4s, v7.4s, #7
+       shl     v7.4s, v7.4s, #25
+       orr     v7.16b, v7.16b, v21.16b
+       add     v5.4s, v7.4s, v5.4s
+       add     v5.4s, v5.4s, v17.4s
+       eor     v17.16b, v5.16b, v18.16b
+       ext     v18.16b, v19.16b, v19.16b, #4
+       tbl     v17.16b, { v17.16b }, v0.16b
+       add     v18.4s, v18.4s, v17.4s
+       eor     v6.16b, v7.16b, v18.16b
+       zip1    v7.4s, v4.4s, v16.4s
+       zip1    v4.4s, v16.4s, v4.4s
+       ushr    v16.4s, v6.4s, #12
+       shl     v6.4s, v6.4s, #20
+       ext     v4.16b, v4.16b, v7.16b, #8
+       orr     v6.16b, v6.16b, v16.16b
+       add     v4.4s, v6.4s, v4.4s
+       add     v4.4s, v4.4s, v5.4s
+       eor     v5.16b, v17.16b, v4.16b
+       ext     v4.16b, v4.16b, v4.16b, #4
+       tbl     v5.16b, { v5.16b }, v2.16b
+       add     v7.4s, v18.4s, v5.4s
+       eor     v6.16b, v6.16b, v7.16b
+       ext     v7.16b, v7.16b, v7.16b, #12
+       ushr    v16.4s, v6.4s, #7
+       shl     v6.4s, v6.4s, #25
+       orr     v6.16b, v6.16b, v16.16b
+       ext     v16.16b, v5.16b, v5.16b, #8
+       eor     v5.16b, v4.16b, v7.16b
+       eor     v4.16b, v6.16b, v16.16b
+.LBB3_11:
+       subs    x13, x15, #1
+       b.eq    .LBB3_9
+       cbnz    x15, .LBB3_10
+       add     x4, x4, x12
+       add     x0, x0, #8
+       subs    x1, x1, #1
+       stp     q5, q4, [x8], #32
+       b.ne    .LBB3_8
+.LBB3_14:
+       add     sp, sp, #368
+       ldp     x20, x19, [sp, #128]
+       ldp     x22, x21, [sp, #112]
+       ldp     x24, x23, [sp, #96]
+       ldp     x26, x25, [sp, #80]
+       ldp     x29, x27, [sp, #64]
        ldp     d9, d8, [sp, #48]
        ldp     d11, d10, [sp, #32]
        ldp     d13, d12, [sp, #16]
-       ldp     d15, d14, [sp], #160
+       ldp     d15, d14, [sp], #144
        ret
-.Lfunc_end2:
-       .size   zfs_blake3_hash_many_sse41, .Lfunc_end2-zfs_blake3_hash_many_sse41
+.Lfunc_end3:
+       .size   zfs_blake3_hash_many_sse41, .Lfunc_end3-zfs_blake3_hash_many_sse41
        .cfi_endproc
        .section        ".note.GNU-stack","",@progbits
-#endif
+#endif
\ No newline at end of file