Add generic implementation handling and SHA2 impl
author     Tino Reichardt <milky-zfs@mcmilk.de>
           Wed, 1 Mar 2023 08:40:28 +0000 (09:40 +0100)
committer  Brian Behlendorf <behlendorf1@llnl.gov>
           Thu, 2 Mar 2023 21:52:21 +0000 (13:52 -0800)
The skeleton file module/icp/include/generic_impl.c can be used to
iterate over different implementations of an algorithm.

It is currently used by SHA256, SHA512 and BLAKE3.
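
A consumer defines its ops type and implementation table and then
includes the skeleton; condensed from the new sha256_impl.c further
down in this diff:

    /* array with the available sha256 implementations */
    static const sha256_ops_t *const sha256_impls[] = {
        &sha256_generic_impl,
        /* ... architecture-specific entries, guarded by #ifdefs ... */
    };

    /* use the generic implementation functions */
    #define IMPL_NAME       "sha256"
    #define IMPL_OPS_T      sha256_ops_t
    #define IMPL_ARRAY      sha256_impls
    #define IMPL_GET_OPS    sha256_get_ops
    #define ZFS_IMPL_OPS    zfs_sha256_ops
    #include <generic_impl.c>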

The Solaris SHA2 implementation has been replaced with a version based
on the public domain code of cppcrypto v0.10.
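
The external interface remains the SHA2Init()/SHA2Update()/SHA2Final()
triple declared in the new include/sys/sha2.h; a minimal usage sketch
(the example_sha256() helper itself is illustrative only):

    #include <sys/sha2.h>

    /* illustrative one-shot SHA256 of a buffer via the new API */
    static void
    example_sha256(const void *buf, size_t len,
        uint8_t digest[SHA256_DIGEST_LENGTH])
    {
        SHA2_CTX ctx;

        SHA2Init(SHA256, &ctx);      /* or SHA384, SHA512, SHA512_256, ... */
        SHA2Update(&ctx, buf, len);
        SHA2Final(digest, &ctx);     /* writes SHA256_DIGEST_LENGTH bytes */
    }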

These assembly files are taken from the current OpenSSL master
(how they are hooked in is sketched after this list):
- sha256-x86_64.S: x64, SSSE3, AVX, AVX2, SHA-NI (x86_64)
- sha512-x86_64.S: x64, AVX, AVX2 (x86_64)
- sha256-armv7.S: ARMv7, NEON, ARMv8-CE (arm)
- sha512-armv7.S: ARMv7, NEON (arm)
- sha256-armv8.S: ARMv7, NEON, ARMv8-CE (aarch64)
- sha512-armv8.S: ARMv7, ARMv8-CE (aarch64)
- sha256-ppc.S: Generic PPC64 LE/BE (ppc64)
- sha512-ppc.S: Generic PPC64 LE/BE (ppc64)
- sha256-p8.S: Power8 ISA Version 2.07 LE/BE (ppc64)
- sha512-p8.S: Power8 ISA Version 2.07 LE/BE (ppc64)
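
Each accelerated routine from the list above is registered through a
small wrapper that brackets the SIMD code with kfpu_begin()/kfpu_end()
and a runtime feature check; condensed from sha256_impl.c, the SSSE3
entry looks roughly like this once the TF() helper macro is expanded:

    extern void zfs_sha256_transform_ssse3(uint32_t s[8], const void *, size_t);

    /* generated by TF(zfs_sha256_transform_ssse3, tf_sha256_ssse3) */
    static inline void
    tf_sha256_ssse3(uint32_t s[8], const void *d, size_t b)
    {
        kfpu_begin();
        zfs_sha256_transform_ssse3(s, d, b);
        kfpu_end();
    }

    static boolean_t sha2_have_ssse3(void)
    {
        return (kfpu_allowed() && zfs_ssse3_available());
    }

    const sha256_ops_t sha256_ssse3_impl = {
        .is_supported = sha2_have_ssse3,
        .transform = tf_sha256_ssse3,
        .name = "ssse3"
    };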

Tested-by: Rich Ercolani <rincebrain@gmail.com>
Tested-by: Sebastian Gottschall <s.gottschall@dd-wrt.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tino Reichardt <milky-zfs@mcmilk.de>
Closes #13741

31 files changed:
include/Makefile.am
include/sys/sha2.h [new file with mode: 0644]
include/sys/zfs_impl.h [new file with mode: 0644]
include/sys/zio_checksum.h
lib/libicp/Makefile.am
lib/libzfs/Makefile.am
lib/libzpool/Makefile.am
module/Kbuild.in
module/Makefile.bsd
module/icp/algs/sha2/sha256_impl.c [new file with mode: 0644]
module/icp/algs/sha2/sha2_generic.c [new file with mode: 0644]
module/icp/algs/sha2/sha512_impl.c [new file with mode: 0644]
module/icp/asm-aarch64/sha2/sha256-armv8.S [new file with mode: 0644]
module/icp/asm-aarch64/sha2/sha512-armv8.S [new file with mode: 0644]
module/icp/asm-arm/sha2/sha256-armv7.S [new file with mode: 0644]
module/icp/asm-arm/sha2/sha512-armv7.S [new file with mode: 0644]
module/icp/asm-ppc64/sha2/sha256-p8.S [new file with mode: 0644]
module/icp/asm-ppc64/sha2/sha256-ppc.S [new file with mode: 0644]
module/icp/asm-ppc64/sha2/sha512-p8.S [new file with mode: 0644]
module/icp/asm-ppc64/sha2/sha512-ppc.S [new file with mode: 0644]
module/icp/asm-x86_64/sha2/sha256-x86_64.S [new file with mode: 0644]
module/icp/asm-x86_64/sha2/sha512-x86_64.S [new file with mode: 0644]
module/icp/include/generic_impl.c [new file with mode: 0644]
module/icp/include/sha2/sha2_impl.h
module/icp/io/sha2_mod.c
module/zfs/sha256.c [deleted file]
module/zfs/sha2_zfs.c [new file with mode: 0644]
module/zfs/zfs_chksum.c
module/zfs/zfs_impl.c [new file with mode: 0644]
module/zfs/zio_checksum.c
tests/zfs-tests/cmd/checksum/sha2_test.c

index 1e5c71150eebe6fe5b5df8ba4cf60e8427786e7b..6897e3c5e337b4038977d7b9658f262f460f0c39 100644 (file)
@@ -75,6 +75,7 @@ COMMON_H = \
        sys/rrwlock.h \
        sys/sa.h \
        sys/sa_impl.h \
+       sys/sha2.h \
        sys/skein.h \
        sys/spa.h \
        sys/spa_checkpoint.h \
@@ -124,6 +125,7 @@ COMMON_H = \
        sys/zfs_delay.h \
        sys/zfs_file.h \
        sys/zfs_fuid.h \
+       sys/zfs_impl.h \
        sys/zfs_project.h \
        sys/zfs_quota.h \
        sys/zfs_racct.h \
diff --git a/include/sys/sha2.h b/include/sys/sha2.h
new file mode 100644 (file)
index 0000000..81dfbbb
--- /dev/null
@@ -0,0 +1,127 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ */
+
+#ifndef        _SYS_SHA2_H
+#define        _SYS_SHA2_H
+
+#ifdef  _KERNEL
+#include <sys/types.h>
+#else
+#include <stdint.h>
+#include <stdlib.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define        SHA224_BLOCK_LENGTH             64
+#define        SHA256_BLOCK_LENGTH             64
+#define        SHA384_BLOCK_LENGTH             128
+#define        SHA512_BLOCK_LENGTH             128
+
+#define        SHA224_DIGEST_LENGTH            28
+#define        SHA256_DIGEST_LENGTH            32
+#define        SHA384_DIGEST_LENGTH            48
+#define        SHA512_DIGEST_LENGTH            64
+
+#define        SHA512_224_DIGEST_LENGTH        28
+#define        SHA512_256_DIGEST_LENGTH        32
+
+#define        SHA256_HMAC_BLOCK_SIZE          64
+#define        SHA512_HMAC_BLOCK_SIZE          128
+
+/* sha256 context */
+typedef struct {
+       uint32_t state[8];
+       uint64_t count[2];
+       uint8_t wbuf[64];
+
+       /* const sha256_ops_t *ops */
+       const void *ops;
+} sha256_ctx;
+
+/* sha512 context */
+typedef struct {
+       uint64_t state[8];
+       uint64_t count[2];
+       uint8_t wbuf[128];
+
+       /* const sha512_ops_t *ops */
+       const void *ops;
+} sha512_ctx;
+
+/* SHA2 context */
+typedef struct {
+       union {
+               sha256_ctx sha256;
+               sha512_ctx sha512;
+       };
+
+       /* algorithm type */
+       int algotype;
+} SHA2_CTX;
+
+/* SHA2 algorithm types */
+typedef enum sha2_mech_type {
+       SHA256_MECH_INFO_TYPE,          /* SUN_CKM_SHA256 */
+       SHA256_HMAC_MECH_INFO_TYPE,     /* SUN_CKM_SHA256_HMAC */
+       SHA256_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC_GENERAL */
+       SHA384_MECH_INFO_TYPE,          /* SUN_CKM_SHA384 */
+       SHA384_HMAC_MECH_INFO_TYPE,     /* SUN_CKM_SHA384_HMAC */
+       SHA384_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC_GENERAL */
+       SHA512_MECH_INFO_TYPE,          /* SUN_CKM_SHA512 */
+       SHA512_HMAC_MECH_INFO_TYPE,     /* SUN_CKM_SHA512_HMAC */
+       SHA512_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC_GENERAL */
+       SHA512_224_MECH_INFO_TYPE,      /* SUN_CKM_SHA512_224 */
+       SHA512_256_MECH_INFO_TYPE       /* SUN_CKM_SHA512_256 */
+} sha2_mech_type_t;
+
+#define        SHA256                  0
+#define        SHA256_HMAC             1
+#define        SHA256_HMAC_GEN         2
+#define        SHA384                  3
+#define        SHA384_HMAC             4
+#define        SHA384_HMAC_GEN         5
+#define        SHA512                  6
+#define        SHA512_HMAC             7
+#define        SHA512_HMAC_GEN         8
+#define        SHA512_224              9
+#define        SHA512_256              10
+
+/* SHA2 Init function */
+extern void SHA2Init(int algotype, SHA2_CTX *ctx);
+
+/* SHA2 Update function */
+extern void SHA2Update(SHA2_CTX *ctx, const void *data, size_t len);
+
+/* SHA2 Final function */
+extern void SHA2Final(void *digest, SHA2_CTX *ctx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SHA2_H */
diff --git a/include/sys/zfs_impl.h b/include/sys/zfs_impl.h
new file mode 100644 (file)
index 0000000..df4899f
--- /dev/null
@@ -0,0 +1,69 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ */
+
+#ifndef        _SYS_ZFS_IMPL_H
+#define        _SYS_ZFS_IMPL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* generic implementation backends */
+typedef struct
+{
+       /* algorithm name */
+       const char *name;
+
+       /* get number of supported implementations */
+       uint32_t (*getcnt)(void);
+
+       /* get id of selected implementation */
+       uint32_t (*getid)(void);
+
+       /* get name of selected implementation */
+       const char *(*getname)(void);
+
+       /* setup id as fastest implementation */
+       void (*set_fastest)(uint32_t id);
+
+       /* set implementation by id */
+       void (*setid)(uint32_t id);
+
+       /* set implementation by name */
+       int (*setname)(const char *val);
+} zfs_impl_t;
+
+       /* return the set of function pointers for the requested algorithm */
+extern const zfs_impl_t *zfs_impl_get_ops(const char *algo);
+
+extern const zfs_impl_t zfs_blake3_ops;
+extern const zfs_impl_t zfs_sha256_ops;
+extern const zfs_impl_t zfs_sha512_ops;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_ZFS_IMPL_H */
index 5903678dfb417c90323cd91c06ab18107113d9e5..9fb79ab4a54b2d65fad6cbe0dbcd3572c5aaadfa 100644 (file)
@@ -110,9 +110,9 @@ _SYS_ZIO_CHECKSUM_H zio_checksum_info_t
  */
 
 /* SHA2 */
-extern zio_checksum_t abd_checksum_SHA256;
-extern zio_checksum_t abd_checksum_SHA512_native;
-extern zio_checksum_t abd_checksum_SHA512_byteswap;
+extern zio_checksum_t abd_checksum_sha256;
+extern zio_checksum_t abd_checksum_sha512_native;
+extern zio_checksum_t abd_checksum_sha512_byteswap;
 
 /* Skein */
 extern zio_checksum_t abd_checksum_skein_native;
index 7c6cf71de24281e5936b6876d24399ac510ec16c..4ba55b2158bc4b80e7f30211247867638fe8a114 100644 (file)
@@ -16,7 +16,6 @@ nodist_libicp_la_SOURCES = \
        module/icp/algs/blake3/blake3.c \
        module/icp/algs/blake3/blake3_generic.c \
        module/icp/algs/blake3/blake3_impl.c \
-       module/icp/algs/blake3/blake3_x86-64.c \
        module/icp/algs/edonr/edonr.c \
        module/icp/algs/modes/modes.c \
        module/icp/algs/modes/cbc.c \
@@ -26,7 +25,9 @@ nodist_libicp_la_SOURCES = \
        module/icp/algs/modes/ctr.c \
        module/icp/algs/modes/ccm.c \
        module/icp/algs/modes/ecb.c \
-       module/icp/algs/sha2/sha2.c \
+       module/icp/algs/sha2/sha2_generic.c \
+       module/icp/algs/sha2/sha256_impl.c \
+       module/icp/algs/sha2/sha512_impl.c \
        module/icp/algs/skein/skein.c \
        module/icp/algs/skein/skein_block.c \
        module/icp/algs/skein/skein_iv.c \
@@ -38,18 +39,31 @@ nodist_libicp_la_SOURCES = \
        module/icp/core/kcf_prov_lib.c \
        module/icp/core/kcf_callprov.c \
        module/icp/core/kcf_mech_tabs.c \
-       module/icp/core/kcf_prov_tabs.c
+       module/icp/core/kcf_prov_tabs.c \
+       module/zfs/zfs_impl.c
 
 if TARGET_CPU_AARCH64
 nodist_libicp_la_SOURCES += \
        module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S \
-       module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
+       module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S \
+       module/icp/asm-aarch64/sha2/sha256-armv8.S \
+       module/icp/asm-aarch64/sha2/sha512-armv8.S
+endif
+
+if TARGET_CPU_ARM
+nodist_libicp_la_SOURCES += \
+       module/icp/asm-arm/sha2/sha256-armv7.S \
+       module/icp/asm-arm/sha2/sha512-armv7.S
 endif
 
 if TARGET_CPU_POWERPC
 nodist_libicp_la_SOURCES += \
        module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S \
-       module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S
+       module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S \
+       module/icp/asm-ppc64/sha2/sha256-ppc.S \
+       module/icp/asm-ppc64/sha2/sha512-ppc.S \
+       module/icp/asm-ppc64/sha2/sha256-p8.S \
+       module/icp/asm-ppc64/sha2/sha512-p8.S
 endif
 
 if TARGET_CPU_X86_64
@@ -60,8 +74,8 @@ nodist_libicp_la_SOURCES += \
        module/icp/asm-x86_64/modes/gcm_pclmulqdq.S \
        module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S \
        module/icp/asm-x86_64/modes/ghash-x86_64.S \
-       module/icp/asm-x86_64/sha2/sha256_impl.S \
-       module/icp/asm-x86_64/sha2/sha512_impl.S \
+       module/icp/asm-x86_64/sha2/sha256-x86_64.S \
+       module/icp/asm-x86_64/sha2/sha512-x86_64.S \
        module/icp/asm-x86_64/blake3/blake3_avx2.S \
        module/icp/asm-x86_64/blake3/blake3_avx512.S \
        module/icp/asm-x86_64/blake3/blake3_sse2.S \
index f5eb84679204535edd3eeb84c6cd51d1e9cc77bc..cffe341220c2293fe77e58fb06362e0dc1fdccef 100644 (file)
@@ -34,8 +34,6 @@ dist_libzfs_la_SOURCES += \
 endif
 
 nodist_libzfs_la_SOURCES = \
-       module/icp/algs/sha2/sha2.c \
-       \
        module/zcommon/cityhash.c \
        module/zcommon/zfeature_common.c \
        module/zcommon/zfs_comutil.c \
@@ -52,7 +50,6 @@ nodist_libzfs_la_SOURCES = \
        module/zcommon/zpool_prop.c \
        module/zcommon/zprop_common.c
 
-
 libzfs_la_LIBADD = \
        libshare.la \
        libzfs_core.la \
index 0cc1997f7a99b07ed175306c323003a436b31e90..0748f1240db980eaeca71a135527adc4aa03de99 100644 (file)
@@ -118,7 +118,7 @@ nodist_libzpool_la_SOURCES = \
        module/zfs/refcount.c \
        module/zfs/rrwlock.c \
        module/zfs/sa.c \
-       module/zfs/sha256.c \
+       module/zfs/sha2_zfs.c \
        module/zfs/skein_zfs.c \
        module/zfs/spa.c \
        module/zfs/spa_checkpoint.c \
index 6b1c9c48b1e6e2a26ffb2adb68d4a79dfdfacceb..21606b8cae2779b70e2b5c771832473d3b0ada96 100644 (file)
@@ -85,7 +85,6 @@ ICP_OBJS := \
        algs/blake3/blake3.o \
        algs/blake3/blake3_generic.o \
        algs/blake3/blake3_impl.o \
-       algs/blake3/blake3_x86-64.o \
        algs/edonr/edonr.o \
        algs/modes/cbc.o \
        algs/modes/ccm.o \
@@ -94,6 +93,9 @@ ICP_OBJS := \
        algs/modes/gcm.o \
        algs/modes/gcm_generic.o \
        algs/modes/modes.o \
+       algs/sha2/sha2_generic.o \
+       algs/sha2/sha256_impl.o \
+       algs/sha2/sha512_impl.o \
        algs/skein/skein.o \
        algs/skein/skein_block.o \
        algs/skein/skein_iv.o \
@@ -119,30 +121,40 @@ ICP_OBJS_X86_64 := \
        asm-x86_64/blake3/blake3_avx512.o \
        asm-x86_64/blake3/blake3_sse2.o \
        asm-x86_64/blake3/blake3_sse41.o \
+       asm-x86_64/sha2/sha256-x86_64.o \
+       asm-x86_64/sha2/sha512-x86_64.o \
        asm-x86_64/modes/aesni-gcm-x86_64.o \
        asm-x86_64/modes/gcm_pclmulqdq.o \
        asm-x86_64/modes/ghash-x86_64.o
 
-
 ICP_OBJS_X86 := \
        algs/aes/aes_impl_aesni.o \
        algs/aes/aes_impl_x86-64.o \
        algs/modes/gcm_pclmulqdq.o
 
+ICP_OBJS_ARM := \
+       asm-arm/sha2/sha256-armv7.o \
+       asm-arm/sha2/sha512-armv7.o
 
 ICP_OBJS_ARM64 := \
        asm-aarch64/blake3/b3_aarch64_sse2.o \
-       asm-aarch64/blake3/b3_aarch64_sse41.o
-
+       asm-aarch64/blake3/b3_aarch64_sse41.o \
+       asm-aarch64/sha2/sha256-armv8.o \
+       asm-aarch64/sha2/sha512-armv8.o
 
 ICP_OBJS_PPC_PPC64 := \
        asm-ppc64/blake3/b3_ppc64le_sse2.o \
-       asm-ppc64/blake3/b3_ppc64le_sse41.o
+       asm-ppc64/blake3/b3_ppc64le_sse41.o \
+       asm-ppc64/sha2/sha256-p8.o \
+       asm-ppc64/sha2/sha512-p8.o \
+       asm-ppc64/sha2/sha256-ppc.o \
+       asm-ppc64/sha2/sha512-ppc.o
 
 zfs-objs             += $(addprefix icp/,$(ICP_OBJS))
 zfs-$(CONFIG_X86)    += $(addprefix icp/,$(ICP_OBJS_X86))
 zfs-$(CONFIG_UML_X86)+= $(addprefix icp/,$(ICP_OBJS_X86))
 zfs-$(CONFIG_X86_64) += $(addprefix icp/,$(ICP_OBJS_X86_64))
+zfs-$(CONFIG_ARM)    += $(addprefix icp/,$(ICP_OBJS_ARM))
 zfs-$(CONFIG_ARM64)  += $(addprefix icp/,$(ICP_OBJS_ARM64))
 zfs-$(CONFIG_PPC)    += $(addprefix icp/,$(ICP_OBJS_PPC_PPC64))
 zfs-$(CONFIG_PPC64)  += $(addprefix icp/,$(ICP_OBJS_PPC_PPC64))
@@ -156,6 +168,11 @@ $(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64) \
 # Suppress objtool "return with modified stack frame" warnings.
 OBJECT_FILES_NON_STANDARD_aesni-gcm-x86_64.o := y
 
+# Suppress objtool "unsupported stack pointer realignment" warnings.
+# See #6950 for the reasoning.
+OBJECT_FILES_NON_STANDARD_sha256-x86_64.o := y
+OBJECT_FILES_NON_STANDARD_sha512-x86_64.o := y
+
 LUA_OBJS := \
        lapi.o \
        lauxlib.o \
@@ -382,6 +399,7 @@ ZFS_OBJS := \
        zfs_chksum.o \
        zfs_fm.o \
        zfs_fuid.o \
+       zfs_impl.o \
        zfs_ioctl.o \
        zfs_log.o \
        zfs_onexit.o \
index 1663dcec63c7491a717fef37345c70cbd67c5be5..6676787967796f4cce15b61cac5c7d1a449c0166 100644 (file)
@@ -13,10 +13,15 @@ KMOD=       openzfs
        ${SRCDIR}/lua \
        ${SRCDIR}/nvpair \
        ${SRCDIR}/icp/algs/blake3 \
+       ${SRCDIR}/icp/algs/edonr \
+       ${SRCDIR}/icp/algs/sha2 \
        ${SRCDIR}/icp/asm-aarch64/blake3 \
+       ${SRCDIR}/icp/asm-aarch64/sha2 \
+       ${SRCDIR}/icp/asm-arm/sha2 \
+       ${SRCDIR}/icp/asm-ppc64/sha2 \
        ${SRCDIR}/icp/asm-ppc64/blake3 \
        ${SRCDIR}/icp/asm-x86_64/blake3 \
-       ${SRCDIR}/icp/algs/edonr \
+       ${SRCDIR}/icp/asm-x86_64/sha2 \
        ${SRCDIR}/os/freebsd/spl \
        ${SRCDIR}/os/freebsd/zfs \
        ${SRCDIR}/unicode \
@@ -27,8 +32,6 @@ KMOD= openzfs
        ${SRCDIR}/zstd/lib/compress \
        ${SRCDIR}/zstd/lib/decompress
 
-
-
 CFLAGS+= -I${INCDIR}
 CFLAGS+= -I${INCDIR}/os/freebsd
 CFLAGS+= -I${INCDIR}/os/freebsd/spl
@@ -88,8 +91,7 @@ SRCS+=        edonr.c
 #icp/algs/blake3
 SRCS+= blake3.c \
        blake3_generic.c \
-       blake3_impl.c \
-       blake3_x86-64.c
+       blake3_impl.c
 
 #icp/asm-aarch64/blake3
 SRCS+= b3_aarch64_sse2.S \
@@ -105,6 +107,29 @@ SRCS+=     blake3_avx2.S \
        blake3_sse2.S \
        blake3_sse41.S
 
+#icp/algs/sha2
+SRCS+= sha2_generic.c \
+       sha256_impl.c \
+       sha512_impl.c
+
+#icp/asm-arm/sha2
+SRCS+= sha256-armv7.S \
+       sha512-armv7.S
+
+#icp/asm-aarch64/sha2
+SRCS+= sha256-armv8.S \
+       sha512-armv8.S
+
+#icp/asm-ppc64/sha2
+SRCS+= sha256-p8.S \
+       sha512-p8.S \
+       sha256-ppc.S \
+       sha512-ppc.S
+
+#icp/asm-x86_64/sha2
+SRCS+= sha256-x86_64.S \
+       sha512-x86_64.S
+
 #lua
 SRCS+= lapi.c \
        lauxlib.c \
@@ -320,6 +345,7 @@ SRCS+=      abd.c \
        zfs_file_os.c \
        zfs_fm.c \
        zfs_fuid.c \
+       zfs_impl.c \
        zfs_ioctl.c \
        zfs_log.c \
        zfs_onexit.c \
diff --git a/module/icp/algs/sha2/sha256_impl.c b/module/icp/algs/sha2/sha256_impl.c
new file mode 100644 (file)
index 0000000..024cfb1
--- /dev/null
@@ -0,0 +1,299 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zfs_impl.h>
+#include <sys/sha2.h>
+#include <sys/simd.h>
+
+#include <sha2/sha2_impl.h>
+
+#define        TF(E, N) \
+       extern void E(uint32_t s[8], const void *, size_t); \
+       static inline void N(uint32_t s[8], const void *d, size_t b) { \
+       kfpu_begin(); E(s, d, b); kfpu_end(); \
+}
+
+/* some implementation is always okay */
+static inline boolean_t sha2_is_supported(void)
+{
+       return (B_TRUE);
+}
+
+#if defined(__x86_64)
+
+extern void zfs_sha256_transform_x64(uint32_t s[8], const void *, size_t);
+const sha256_ops_t sha256_x64_impl = {
+       .is_supported = sha2_is_supported,
+       .transform = zfs_sha256_transform_x64,
+       .name = "x64"
+};
+
+#if defined(HAVE_SSSE3)
+static boolean_t sha2_have_ssse3(void)
+{
+       return (kfpu_allowed() && zfs_ssse3_available());
+}
+
+TF(zfs_sha256_transform_ssse3, tf_sha256_ssse3);
+const sha256_ops_t sha256_ssse3_impl = {
+       .is_supported = sha2_have_ssse3,
+       .transform = tf_sha256_ssse3,
+       .name = "ssse3"
+};
+#endif
+
+#if defined(HAVE_AVX)
+static boolean_t sha2_have_avx(void)
+{
+       return (kfpu_allowed() && zfs_avx_available());
+}
+
+TF(zfs_sha256_transform_avx, tf_sha256_avx);
+const sha256_ops_t sha256_avx_impl = {
+       .is_supported = sha2_have_avx,
+       .transform = tf_sha256_avx,
+       .name = "avx"
+};
+#endif
+
+#if defined(HAVE_AVX2)
+static boolean_t sha2_have_avx2(void)
+{
+       return (kfpu_allowed() && zfs_avx2_available());
+}
+
+TF(zfs_sha256_transform_avx2, tf_sha256_avx2);
+const sha256_ops_t sha256_avx2_impl = {
+       .is_supported = sha2_have_avx2,
+       .transform = tf_sha256_avx2,
+       .name = "avx2"
+};
+#endif
+
+#if defined(HAVE_SSE4_1)
+static boolean_t sha2_have_shani(void)
+{
+       return (kfpu_allowed() && zfs_sse4_1_available() && \
+           zfs_shani_available());
+}
+
+TF(zfs_sha256_transform_shani, tf_sha256_shani);
+const sha256_ops_t sha256_shani_impl = {
+       .is_supported = sha2_have_shani,
+       .transform = tf_sha256_shani,
+       .name = "shani"
+};
+#endif
+
+#elif defined(__aarch64__) || defined(__arm__)
+static boolean_t sha256_have_neon(void)
+{
+       return (kfpu_allowed() && zfs_neon_available());
+}
+
+static boolean_t sha256_have_armv8ce(void)
+{
+       return (kfpu_allowed() && zfs_sha256_available());
+}
+
+extern void zfs_sha256_block_armv7(uint32_t s[8], const void *, size_t);
+const sha256_ops_t sha256_armv7_impl = {
+       .is_supported = sha2_is_supported,
+       .transform = zfs_sha256_block_armv7,
+       .name = "armv7"
+};
+
+TF(zfs_sha256_block_neon, tf_sha256_neon);
+const sha256_ops_t sha256_neon_impl = {
+       .is_supported = sha256_have_neon,
+       .transform = tf_sha256_neon,
+       .name = "neon"
+};
+
+TF(zfs_sha256_block_armv8, tf_sha256_armv8ce);
+const sha256_ops_t sha256_armv8_impl = {
+       .is_supported = sha256_have_armv8ce,
+       .transform = tf_sha256_armv8ce,
+       .name = "armv8-ce"
+};
+
+#elif defined(__PPC64__)
+static boolean_t sha256_have_vsx(void)
+{
+       return (kfpu_allowed() && zfs_vsx_available());
+}
+
+TF(zfs_sha256_ppc, tf_sha256_ppc);
+const sha256_ops_t sha256_ppc_impl = {
+       .is_supported = sha2_is_supported,
+       .transform = tf_sha256_ppc,
+       .name = "ppc"
+};
+
+TF(zfs_sha256_power8, tf_sha256_power8);
+const sha256_ops_t sha256_power8_impl = {
+       .is_supported = sha256_have_vsx,
+       .transform = tf_sha256_power8,
+       .name = "power8"
+};
+#endif /* __PPC64__ */
+
+/* the generic implementation, defined in sha2_generic.c */
+extern const sha256_ops_t sha256_generic_impl;
+
+/* array with all sha256 implementations */
+static const sha256_ops_t *const sha256_impls[] = {
+       &sha256_generic_impl,
+#if defined(__x86_64)
+       &sha256_x64_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_SSSE3)
+       &sha256_ssse3_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_AVX)
+       &sha256_avx_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_AVX2)
+       &sha256_avx2_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_SSE4_1)
+       &sha256_shani_impl,
+#endif
+#if defined(__aarch64__) || defined(__arm__)
+       &sha256_armv7_impl,
+       &sha256_neon_impl,
+       &sha256_armv8_impl,
+#endif
+#if defined(__PPC64__)
+       &sha256_ppc_impl,
+       &sha256_power8_impl,
+#endif /* __PPC64__ */
+};
+
+/* use the generic implementation functions */
+#define        IMPL_NAME               "sha256"
+#define        IMPL_OPS_T              sha256_ops_t
+#define        IMPL_ARRAY              sha256_impls
+#define        IMPL_GET_OPS            sha256_get_ops
+#define        ZFS_IMPL_OPS            zfs_sha256_ops
+#include <generic_impl.c>
+
+#ifdef _KERNEL
+
+#define        IMPL_FMT(impl, i)       (((impl) == (i)) ? "[%s] " : "%s ")
+
+#if defined(__linux__)
+
+static int
+sha256_param_get(char *buffer, zfs_kernel_param_t *unused)
+{
+       const uint32_t impl = IMPL_READ(generic_impl_chosen);
+       char *fmt;
+       int cnt = 0;
+
+       /* cycling */
+       fmt = IMPL_FMT(impl, IMPL_CYCLE);
+       cnt += sprintf(buffer + cnt, fmt, "cycle");
+
+       /* list fastest */
+       fmt = IMPL_FMT(impl, IMPL_FASTEST);
+       cnt += sprintf(buffer + cnt, fmt, "fastest");
+
+       /* list all supported implementations */
+       generic_impl_init();
+       for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) {
+               fmt = IMPL_FMT(impl, i);
+               cnt += sprintf(buffer + cnt, fmt,
+                   generic_supp_impls[i]->name);
+       }
+
+       return (cnt);
+}
+
+static int
+sha256_param_set(const char *val, zfs_kernel_param_t *unused)
+{
+       (void) unused;
+       return (generic_impl_setname(val));
+}
+
+#elif defined(__FreeBSD__)
+
+#include <sys/sbuf.h>
+
+static int
+sha256_param(ZFS_MODULE_PARAM_ARGS)
+{
+       int err;
+
+       generic_impl_init();
+       if (req->newptr == NULL) {
+               const uint32_t impl = IMPL_READ(generic_impl_chosen);
+               const int init_buflen = 64;
+               const char *fmt;
+               struct sbuf *s;
+
+               s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req);
+
+               /* cycling */
+               fmt = IMPL_FMT(impl, IMPL_CYCLE);
+               (void) sbuf_printf(s, fmt, "cycle");
+
+               /* list fastest */
+               fmt = IMPL_FMT(impl, IMPL_FASTEST);
+               (void) sbuf_printf(s, fmt, "fastest");
+
+               /* list all supported implementations */
+               for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) {
+                       fmt = IMPL_FMT(impl, i);
+                       (void) sbuf_printf(s, fmt, generic_supp_impls[i]->name);
+               }
+
+               err = sbuf_finish(s);
+               sbuf_delete(s);
+
+               return (err);
+       }
+
+       char buf[16];
+
+       err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
+       if (err) {
+               return (err);
+       }
+
+       return (-generic_impl_setname(buf));
+}
+#endif
+
+#undef IMPL_FMT
+
+ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, sha256_impl,
+    sha256_param_set, sha256_param_get, ZMOD_RW, \
+       "Select SHA256 implementation.");
+#endif
+
+#undef TF
diff --git a/module/icp/algs/sha2/sha2_generic.c b/module/icp/algs/sha2/sha2_generic.c
new file mode 100644 (file)
index 0000000..e69dc77
--- /dev/null
@@ -0,0 +1,562 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on public domain code in cppcrypto 0.10.
+ * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zfs_impl.h>
+#include <sys/sha2.h>
+
+#include <sha2/sha2_impl.h>
+
+/*
+ * On i386, gcc emits this for sha512_generic():
+ * error: the frame size of 1040 bytes is larger than 1024
+ */
+#if defined(__GNUC__) && defined(_ILP32)
+#pragma GCC diagnostic ignored "-Wframe-larger-than="
+#endif
+
+/* SHA256 */
+static const uint32_t SHA256_K[64] = {
+       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+#define        Ch(x, y, z)     ((z) ^ ((x) & ((y) ^ (z))))
+#define        Maj(x, y, z)    (((y) & (z)) | (((y) | (z)) & (x)))
+
+#define        rotr32(x, n)    (((x) >> n) | ((x) << (32 - n)))
+#define        sum0(x)         (rotr32((x),  2) ^ rotr32((x), 13) ^ rotr32((x), 22))
+#define        sum1(x)         (rotr32((x),  6) ^ rotr32((x), 11) ^ rotr32((x), 25))
+#define        sigma0(x)       (rotr32((x),  7) ^ rotr32((x), 18) ^ ((x) >> 3))
+#define        sigma1(x)       (rotr32((x), 17) ^ rotr32((x), 19) ^ ((x) >> 10))
+
+#define        WU(j) (W[j & 15] += sigma1(W[(j + 14) & 15]) \
+       + W[(j + 9) & 15] + sigma0(W[(j + 1) & 15]))
+
+#define        COMPRESS(i, j, K) \
+       T1 = h + sum1(e) + Ch(e, f, g) + K[i + j] + (i? WU(j): W[j]); \
+       T2 = sum0(a) + Maj(a, b, c); \
+       h = g, g = f, f = e, e = d + T1; \
+       d = c, c = b, b = a, a = T1 + T2;
+
+static void sha256_generic(uint32_t state[8], const void *data, size_t num_blks)
+{
+       uint64_t blk;
+
+       for (blk = 0; blk < num_blks; blk++) {
+               uint32_t W[16];
+               uint32_t a, b, c, d, e, f, g, h;
+               uint32_t T1, T2;
+               int i;
+
+               for (i = 0; i < 16; i++) {
+                       W[i] = BE_32( \
+                           (((const uint32_t *)(data))[blk * 16 + i]));
+               }
+
+               a = state[0];
+               b = state[1];
+               c = state[2];
+               d = state[3];
+               e = state[4];
+               f = state[5];
+               g = state[6];
+               h = state[7];
+
+               for (i = 0; i <= 63; i += 16) {
+                       COMPRESS(i, 0, SHA256_K);
+                       COMPRESS(i, 1, SHA256_K);
+                       COMPRESS(i, 2, SHA256_K);
+                       COMPRESS(i, 3, SHA256_K);
+                       COMPRESS(i, 4, SHA256_K);
+                       COMPRESS(i, 5, SHA256_K);
+                       COMPRESS(i, 6, SHA256_K);
+                       COMPRESS(i, 7, SHA256_K);
+                       COMPRESS(i, 8, SHA256_K);
+                       COMPRESS(i, 9, SHA256_K);
+                       COMPRESS(i, 10, SHA256_K);
+                       COMPRESS(i, 11, SHA256_K);
+                       COMPRESS(i, 12, SHA256_K);
+                       COMPRESS(i, 13, SHA256_K);
+                       COMPRESS(i, 14, SHA256_K);
+                       COMPRESS(i, 15, SHA256_K);
+               }
+
+               state[0] += a;
+               state[1] += b;
+               state[2] += c;
+               state[3] += d;
+               state[4] += e;
+               state[5] += f;
+               state[6] += g;
+               state[7] += h;
+       }
+}
+
+#undef sum0
+#undef sum1
+#undef sigma0
+#undef sigma1
+
+#define        rotr64(x, n)    (((x) >> n) | ((x) << (64 - n)))
+#define        sum0(x)         (rotr64((x), 28) ^ rotr64((x), 34) ^ rotr64((x), 39))
+#define        sum1(x)         (rotr64((x), 14) ^ rotr64((x), 18) ^ rotr64((x), 41))
+#define        sigma0(x)       (rotr64((x),  1) ^ rotr64((x),  8) ^ ((x) >> 7))
+#define        sigma1(x)       (rotr64((x), 19) ^ rotr64((x), 61) ^ ((x) >> 6))
+
+/* SHA512 */
+static const uint64_t SHA512_K[80] = {
+       0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f,
+       0xe9b5dba58189dbbc, 0x3956c25bf348b538, 0x59f111f1b605d019,
+       0x923f82a4af194f9b, 0xab1c5ed5da6d8118, 0xd807aa98a3030242,
+       0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2,
+       0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235,
+       0xc19bf174cf692694, 0xe49b69c19ef14ad2, 0xefbe4786384f25e3,
+       0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65, 0x2de92c6f592b0275,
+       0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5,
+       0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f,
+       0xbf597fc7beef0ee4, 0xc6e00bf33da88fc2, 0xd5a79147930aa725,
+       0x06ca6351e003826f, 0x142929670a0e6e70, 0x27b70a8546d22ffc,
+       0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df,
+       0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6,
+       0x92722c851482353b, 0xa2bfe8a14cf10364, 0xa81a664bbc423001,
+       0xc24b8b70d0f89791, 0xc76c51a30654be30, 0xd192e819d6ef5218,
+       0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8,
+       0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99,
+       0x34b0bcb5e19b48a8, 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb,
+       0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3, 0x748f82ee5defb2fc,
+       0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec,
+       0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915,
+       0xc67178f2e372532b, 0xca273eceea26619c, 0xd186b8c721c0c207,
+       0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178, 0x06f067aa72176fba,
+       0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b,
+       0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc,
+       0x431d67c49c100d4c, 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a,
+       0x5fcb6fab3ad6faec, 0x6c44198c4a475817
+};
+
+static void sha512_generic(uint64_t state[8], const void *data, size_t num_blks)
+{
+       uint64_t blk;
+
+       for (blk = 0; blk < num_blks; blk++) {
+               uint64_t W[16];
+               uint64_t a, b, c, d, e, f, g, h;
+               uint64_t T1, T2;
+               int i;
+
+               for (i = 0; i < 16; i++) {
+                       W[i] = BE_64( \
+                           (((const uint64_t *)(data))[blk * 16 + i]));
+               }
+
+               a = state[0];
+               b = state[1];
+               c = state[2];
+               d = state[3];
+               e = state[4];
+               f = state[5];
+               g = state[6];
+               h = state[7];
+
+               for (i = 0; i <= 79; i += 16) {
+                       COMPRESS(i, 0, SHA512_K);
+                       COMPRESS(i, 1, SHA512_K);
+                       COMPRESS(i, 2, SHA512_K);
+                       COMPRESS(i, 3, SHA512_K);
+                       COMPRESS(i, 4, SHA512_K);
+                       COMPRESS(i, 5, SHA512_K);
+                       COMPRESS(i, 6, SHA512_K);
+                       COMPRESS(i, 7, SHA512_K);
+                       COMPRESS(i, 8, SHA512_K);
+                       COMPRESS(i, 9, SHA512_K);
+                       COMPRESS(i, 10, SHA512_K);
+                       COMPRESS(i, 11, SHA512_K);
+                       COMPRESS(i, 12, SHA512_K);
+                       COMPRESS(i, 13, SHA512_K);
+                       COMPRESS(i, 14, SHA512_K);
+                       COMPRESS(i, 15, SHA512_K);
+               }
+               state[0] += a;
+               state[1] += b;
+               state[2] += c;
+               state[3] += d;
+               state[4] += e;
+               state[5] += f;
+               state[6] += g;
+               state[7] += h;
+       }
+}
+
+static void sha256_update(sha256_ctx *ctx, const uint8_t *data, size_t len)
+{
+       uint64_t pos = ctx->count[0];
+       uint64_t total = ctx->count[1];
+       uint8_t *m = ctx->wbuf;
+       const sha256_ops_t *ops = ctx->ops;
+
+       if (pos && pos + len >= 64) {
+               memcpy(m + pos, data, 64 - pos);
+               ops->transform(ctx->state, m, 1);
+               len -= 64 - pos;
+               total += (64 - pos) * 8;
+               data += 64 - pos;
+               pos = 0;
+       }
+
+       if (len >= 64) {
+               uint32_t blocks = len / 64;
+               uint32_t bytes = blocks * 64;
+               ops->transform(ctx->state, data, blocks);
+               len -= bytes;
+               total += (bytes) * 8;
+               data += bytes;
+       }
+       memcpy(m + pos, data, len);
+
+       pos += len;
+       total += len * 8;
+       ctx->count[0] = pos;
+       ctx->count[1] = total;
+}
+
+static void sha512_update(sha512_ctx *ctx, const uint8_t *data, size_t len)
+{
+       uint64_t pos = ctx->count[0];
+       uint64_t total = ctx->count[1];
+       uint8_t *m = ctx->wbuf;
+       const sha512_ops_t *ops = ctx->ops;
+
+       if (pos && pos + len >= 128) {
+               memcpy(m + pos, data, 128 - pos);
+               ops->transform(ctx->state, m, 1);
+               len -= 128 - pos;
+               total += (128 - pos) * 8;
+               data += 128 - pos;
+               pos = 0;
+       }
+
+       if (len >= 128) {
+               uint64_t blocks = len / 128;
+               uint64_t bytes = blocks * 128;
+               ops->transform(ctx->state, data, blocks);
+               len -= bytes;
+               total += (bytes) * 8;
+               data += bytes;
+       }
+       memcpy(m + pos, data, len);
+
+       pos += len;
+       total += len * 8;
+       ctx->count[0] = pos;
+       ctx->count[1] = total;
+}
+
+static void sha256_final(sha256_ctx *ctx, uint8_t *result, int bits)
+{
+       uint64_t mlen, pos = ctx->count[0];
+       uint8_t *m = ctx->wbuf;
+       uint32_t *R = (uint32_t *)result;
+       const sha256_ops_t *ops = ctx->ops;
+
+       m[pos++] = 0x80;
+       if (pos > 56) {
+               memset(m + pos, 0, 64 - pos);
+               ops->transform(ctx->state, m, 1);
+               pos = 0;
+       }
+
+       memset(m + pos, 0, 64 - pos);
+       mlen = BE_64(ctx->count[1]);
+       memcpy(m + (64 - 8), &mlen, 64 / 8);
+       ops->transform(ctx->state, m, 1);
+
+       switch (bits) {
+       case 224: /* 28 - unused currently /TR */
+               R[0] = BE_32(ctx->state[0]);
+               R[1] = BE_32(ctx->state[1]);
+               R[2] = BE_32(ctx->state[2]);
+               R[3] = BE_32(ctx->state[3]);
+               R[4] = BE_32(ctx->state[4]);
+               R[5] = BE_32(ctx->state[5]);
+               R[6] = BE_32(ctx->state[6]);
+               break;
+       case 256: /* 32 */
+               R[0] = BE_32(ctx->state[0]);
+               R[1] = BE_32(ctx->state[1]);
+               R[2] = BE_32(ctx->state[2]);
+               R[3] = BE_32(ctx->state[3]);
+               R[4] = BE_32(ctx->state[4]);
+               R[5] = BE_32(ctx->state[5]);
+               R[6] = BE_32(ctx->state[6]);
+               R[7] = BE_32(ctx->state[7]);
+               break;
+       }
+
+       memset(ctx, 0, sizeof (*ctx));
+}
+
+static void sha512_final(sha512_ctx *ctx, uint8_t *result, int bits)
+{
+       uint64_t mlen, pos = ctx->count[0];
+       uint8_t *m = ctx->wbuf, *r;
+       uint64_t *R = (uint64_t *)result;
+       const sha512_ops_t *ops = ctx->ops;
+
+       m[pos++] = 0x80;
+       if (pos > 112) {
+               memset(m + pos, 0, 128 - pos);
+               ops->transform(ctx->state, m, 1);
+               pos = 0;
+       }
+
+       memset(m + pos, 0, 128 - pos);
+       mlen = BE_64(ctx->count[1]);
+       memcpy(m + (128 - 8), &mlen, 64 / 8);
+       ops->transform(ctx->state, m, 1);
+
+       switch (bits) {
+       case 224: /* 28 => 3.5 x 8 */
+               r = result + 24;
+               R[0] = BE_64(ctx->state[0]);
+               R[1] = BE_64(ctx->state[1]);
+               R[2] = BE_64(ctx->state[2]);
+               /* last 4 bytes are special here */
+               *r++ = (uint8_t)(ctx->state[3] >> 56);
+               *r++ = (uint8_t)(ctx->state[3] >> 48);
+               *r++ = (uint8_t)(ctx->state[3] >> 40);
+               *r++ = (uint8_t)(ctx->state[3] >> 32);
+               break;
+       case 256: /* 32 */
+               R[0] = BE_64(ctx->state[0]);
+               R[1] = BE_64(ctx->state[1]);
+               R[2] = BE_64(ctx->state[2]);
+               R[3] = BE_64(ctx->state[3]);
+               break;
+       case 384: /* 48 */
+               R[0] = BE_64(ctx->state[0]);
+               R[1] = BE_64(ctx->state[1]);
+               R[2] = BE_64(ctx->state[2]);
+               R[3] = BE_64(ctx->state[3]);
+               R[4] = BE_64(ctx->state[4]);
+               R[5] = BE_64(ctx->state[5]);
+               break;
+       case 512: /* 64 */
+               R[0] = BE_64(ctx->state[0]);
+               R[1] = BE_64(ctx->state[1]);
+               R[2] = BE_64(ctx->state[2]);
+               R[3] = BE_64(ctx->state[3]);
+               R[4] = BE_64(ctx->state[4]);
+               R[5] = BE_64(ctx->state[5]);
+               R[6] = BE_64(ctx->state[6]);
+               R[7] = BE_64(ctx->state[7]);
+               break;
+       }
+
+       memset(ctx, 0, sizeof (*ctx));
+}
+
+/* SHA2 Init function */
+void
+SHA2Init(int algotype, SHA2_CTX *ctx)
+{
+       sha256_ctx *ctx256 = &ctx->sha256;
+       sha512_ctx *ctx512 = &ctx->sha512;
+
+       ASSERT3U(algotype, >=, SHA256_MECH_INFO_TYPE);
+       ASSERT3U(algotype, <=, SHA512_256_MECH_INFO_TYPE);
+
+       memset(ctx, 0, sizeof (*ctx));
+       ctx->algotype = algotype;
+       switch (ctx->algotype) {
+               case SHA256_MECH_INFO_TYPE:
+               case SHA256_HMAC_MECH_INFO_TYPE:
+               case SHA256_HMAC_GEN_MECH_INFO_TYPE:
+                       ctx256->state[0] = 0x6a09e667;
+                       ctx256->state[1] = 0xbb67ae85;
+                       ctx256->state[2] = 0x3c6ef372;
+                       ctx256->state[3] = 0xa54ff53a;
+                       ctx256->state[4] = 0x510e527f;
+                       ctx256->state[5] = 0x9b05688c;
+                       ctx256->state[6] = 0x1f83d9ab;
+                       ctx256->state[7] = 0x5be0cd19;
+                       ctx256->count[0] = 0;
+                       ctx256->ops = sha256_get_ops();
+                       break;
+               case SHA384_MECH_INFO_TYPE:
+               case SHA384_HMAC_MECH_INFO_TYPE:
+               case SHA384_HMAC_GEN_MECH_INFO_TYPE:
+                       ctx512->state[0] = 0xcbbb9d5dc1059ed8ULL;
+                       ctx512->state[1] = 0x629a292a367cd507ULL;
+                       ctx512->state[2] = 0x9159015a3070dd17ULL;
+                       ctx512->state[3] = 0x152fecd8f70e5939ULL;
+                       ctx512->state[4] = 0x67332667ffc00b31ULL;
+                       ctx512->state[5] = 0x8eb44a8768581511ULL;
+                       ctx512->state[6] = 0xdb0c2e0d64f98fa7ULL;
+                       ctx512->state[7] = 0x47b5481dbefa4fa4ULL;
+                       ctx512->count[0] = 0;
+                       ctx512->count[1] = 0;
+                       ctx512->ops = sha512_get_ops();
+                       break;
+               case SHA512_MECH_INFO_TYPE:
+               case SHA512_HMAC_MECH_INFO_TYPE:
+               case SHA512_HMAC_GEN_MECH_INFO_TYPE:
+                       ctx512->state[0] = 0x6a09e667f3bcc908ULL;
+                       ctx512->state[1] = 0xbb67ae8584caa73bULL;
+                       ctx512->state[2] = 0x3c6ef372fe94f82bULL;
+                       ctx512->state[3] = 0xa54ff53a5f1d36f1ULL;
+                       ctx512->state[4] = 0x510e527fade682d1ULL;
+                       ctx512->state[5] = 0x9b05688c2b3e6c1fULL;
+                       ctx512->state[6] = 0x1f83d9abfb41bd6bULL;
+                       ctx512->state[7] = 0x5be0cd19137e2179ULL;
+                       ctx512->count[0] = 0;
+                       ctx512->count[1] = 0;
+                       ctx512->ops = sha512_get_ops();
+                       break;
+               case SHA512_224_MECH_INFO_TYPE:
+                       ctx512->state[0] = 0x8c3d37c819544da2ULL;
+                       ctx512->state[1] = 0x73e1996689dcd4d6ULL;
+                       ctx512->state[2] = 0x1dfab7ae32ff9c82ULL;
+                       ctx512->state[3] = 0x679dd514582f9fcfULL;
+                       ctx512->state[4] = 0x0f6d2b697bd44da8ULL;
+                       ctx512->state[5] = 0x77e36f7304c48942ULL;
+                       ctx512->state[6] = 0x3f9d85a86a1d36c8ULL;
+                       ctx512->state[7] = 0x1112e6ad91d692a1ULL;
+                       ctx512->count[0] = 0;
+                       ctx512->count[1] = 0;
+                       ctx512->ops = sha512_get_ops();
+                       break;
+               case SHA512_256_MECH_INFO_TYPE:
+                       ctx512->state[0] = 0x22312194fc2bf72cULL;
+                       ctx512->state[1] = 0x9f555fa3c84c64c2ULL;
+                       ctx512->state[2] = 0x2393b86b6f53b151ULL;
+                       ctx512->state[3] = 0x963877195940eabdULL;
+                       ctx512->state[4] = 0x96283ee2a88effe3ULL;
+                       ctx512->state[5] = 0xbe5e1e2553863992ULL;
+                       ctx512->state[6] = 0x2b0199fc2c85b8aaULL;
+                       ctx512->state[7] = 0x0eb72ddc81c52ca2ULL;
+                       ctx512->count[0] = 0;
+                       ctx512->count[1] = 0;
+                       ctx512->ops = sha512_get_ops();
+                       break;
+       }
+}
+
+/* SHA2 Update function */
+void
+SHA2Update(SHA2_CTX *ctx, const void *data, size_t len)
+{
+       /* check for zero input length */
+       if (len == 0)
+               return;
+
+       ASSERT3P(data, !=, NULL);
+
+       switch (ctx->algotype) {
+               case SHA256_MECH_INFO_TYPE:
+               case SHA256_HMAC_MECH_INFO_TYPE:
+               case SHA256_HMAC_GEN_MECH_INFO_TYPE:
+                       sha256_update(&ctx->sha256, data, len);
+                       break;
+               case SHA384_MECH_INFO_TYPE:
+               case SHA384_HMAC_MECH_INFO_TYPE:
+               case SHA384_HMAC_GEN_MECH_INFO_TYPE:
+                       sha512_update(&ctx->sha512, data, len);
+                       break;
+               case SHA512_MECH_INFO_TYPE:
+               case SHA512_HMAC_MECH_INFO_TYPE:
+               case SHA512_HMAC_GEN_MECH_INFO_TYPE:
+                       sha512_update(&ctx->sha512, data, len);
+                       break;
+               case SHA512_224_MECH_INFO_TYPE:
+                       sha512_update(&ctx->sha512, data, len);
+                       break;
+               case SHA512_256_MECH_INFO_TYPE:
+                       sha512_update(&ctx->sha512, data, len);
+                       break;
+       }
+}
+
+/* SHA2Final function */
+void
+SHA2Final(void *digest, SHA2_CTX *ctx)
+{
+       switch (ctx->algotype) {
+               case SHA256_MECH_INFO_TYPE:
+               case SHA256_HMAC_MECH_INFO_TYPE:
+               case SHA256_HMAC_GEN_MECH_INFO_TYPE:
+                       sha256_final(&ctx->sha256, digest, 256);
+                       break;
+               case SHA384_MECH_INFO_TYPE:
+               case SHA384_HMAC_MECH_INFO_TYPE:
+               case SHA384_HMAC_GEN_MECH_INFO_TYPE:
+                       sha512_final(&ctx->sha512, digest, 384);
+                       break;
+               case SHA512_MECH_INFO_TYPE:
+               case SHA512_HMAC_MECH_INFO_TYPE:
+               case SHA512_HMAC_GEN_MECH_INFO_TYPE:
+                       sha512_final(&ctx->sha512, digest, 512);
+                       break;
+               case SHA512_224_MECH_INFO_TYPE:
+                       sha512_final(&ctx->sha512, digest, 224);
+                       break;
+               case SHA512_256_MECH_INFO_TYPE:
+                       sha512_final(&ctx->sha512, digest, 256);
+                       break;
+       }
+}
+
+/* the generic implementation is always okay */
+static boolean_t sha2_is_supported(void)
+{
+       return (B_TRUE);
+}
+
+const sha256_ops_t sha256_generic_impl = {
+       .name = "generic",
+       .transform = sha256_generic,
+       .is_supported = sha2_is_supported
+};
+
+const sha512_ops_t sha512_generic_impl = {
+       .name = "generic",
+       .transform = sha512_generic,
+       .is_supported = sha2_is_supported
+};
diff --git a/module/icp/algs/sha2/sha512_impl.c b/module/icp/algs/sha2/sha512_impl.c
new file mode 100644 (file)
index 0000000..d213123
--- /dev/null
@@ -0,0 +1,276 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zfs_impl.h>
+#include <sys/sha2.h>
+#include <sys/simd.h>
+
+#include <sha2/sha2_impl.h>
+
+#define        TF(E, N) \
+       extern void E(uint64_t s[8], const void *, size_t); \
+       static inline void N(uint64_t s[8], const void *d, size_t b) { \
+       kfpu_begin(); E(s, d, b); kfpu_end(); \
+}
+
+/* some implementation is always okay */
+static inline boolean_t sha2_is_supported(void)
+{
+       return (B_TRUE);
+}
+
+#if defined(__x86_64)
+
+extern void zfs_sha512_transform_x64(uint64_t s[8], const void *, size_t);
+const sha512_ops_t sha512_x64_impl = {
+       .is_supported = sha2_is_supported,
+       .transform = zfs_sha512_transform_x64,
+       .name = "x64"
+};
+
+#if defined(HAVE_AVX)
+static boolean_t sha2_have_avx(void)
+{
+       return (kfpu_allowed() && zfs_avx_available());
+}
+
+TF(zfs_sha512_transform_avx, tf_sha512_avx);
+const sha512_ops_t sha512_avx_impl = {
+       .is_supported = sha2_have_avx,
+       .transform = tf_sha512_avx,
+       .name = "avx"
+};
+#endif
+
+#if defined(HAVE_AVX2)
+static boolean_t sha2_have_avx2(void)
+{
+       return (kfpu_allowed() && zfs_avx2_available());
+}
+
+TF(zfs_sha512_transform_avx2, tf_sha512_avx2);
+const sha512_ops_t sha512_avx2_impl = {
+       .is_supported = sha2_have_avx2,
+       .transform = tf_sha512_avx2,
+       .name = "avx2"
+};
+#endif
+
+#elif defined(__aarch64__)
+extern void zfs_sha512_block_armv7(uint64_t s[8], const void *, size_t);
+const sha512_ops_t sha512_armv7_impl = {
+       .is_supported = sha2_is_supported,
+       .transform = zfs_sha512_block_armv7,
+       .name = "armv7"
+};
+
+static boolean_t sha512_have_armv8ce(void)
+{
+       return (kfpu_allowed() && zfs_sha512_available());
+}
+
+TF(zfs_sha512_block_armv8, tf_sha512_armv8ce);
+const sha512_ops_t sha512_armv8_impl = {
+       .is_supported = sha512_have_armv8ce,
+       .transform = tf_sha512_armv8ce,
+       .name = "armv8-ce"
+};
+
+#elif defined(__arm__)
+extern void zfs_sha512_block_armv7(uint64_t s[8], const void *, size_t);
+const sha512_ops_t sha512_armv7_impl = {
+       .is_supported = sha2_is_supported,
+       .transform = zfs_sha512_block_armv7,
+       .name = "armv7"
+};
+
+static boolean_t sha512_have_neon(void)
+{
+       return (kfpu_allowed() && zfs_neon_available());
+}
+
+TF(zfs_sha512_block_neon, tf_sha512_neon);
+const sha512_ops_t sha512_neon_impl = {
+       .is_supported = sha512_have_neon,
+       .transform = tf_sha512_neon,
+       .name = "neon"
+};
+
+#elif defined(__PPC64__)
+TF(zfs_sha512_ppc, tf_sha512_ppc);
+const sha512_ops_t sha512_ppc_impl = {
+       .is_supported = sha2_is_supported,
+       .transform = tf_sha512_ppc,
+       .name = "ppc"
+};
+
+static boolean_t sha512_have_vsx(void)
+{
+       return (kfpu_allowed() && zfs_vsx_available());
+}
+
+TF(zfs_sha512_power8, tf_sha512_power8);
+const sha512_ops_t sha512_power8_impl = {
+       .is_supported = sha512_have_vsx,
+       .transform = tf_sha512_power8,
+       .name = "power8"
+};
+#endif /* __PPC64__ */
+
+/* the generic implementation, defined in sha2_generic.c */
+extern const sha512_ops_t sha512_generic_impl;
+
+/* array with all sha512 implementations */
+static const sha512_ops_t *const sha512_impls[] = {
+       &sha512_generic_impl,
+#if defined(__x86_64)
+       &sha512_x64_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_AVX)
+       &sha512_avx_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_AVX2)
+       &sha512_avx2_impl,
+#endif
+#if defined(__aarch64__)
+       &sha512_armv7_impl,
+       &sha512_armv8_impl,
+#endif
+#if defined(__arm__)
+       &sha512_armv7_impl,
+       &sha512_neon_impl,
+#endif
+#if defined(__PPC64__)
+       &sha512_ppc_impl,
+       &sha512_power8_impl,
+#endif /* __PPC64__ */
+};
+
+/* use the generic implementation functions */
+#define        IMPL_NAME               "sha512"
+#define        IMPL_OPS_T              sha512_ops_t
+#define        IMPL_ARRAY              sha512_impls
+#define        IMPL_GET_OPS            sha512_get_ops
+#define        ZFS_IMPL_OPS            zfs_sha512_ops
+#include <generic_impl.c>
+
+#ifdef _KERNEL
+
+#define        IMPL_FMT(impl, i)       (((impl) == (i)) ? "[%s] " : "%s ")
+
+#if defined(__linux__)
+
+static int
+sha512_param_get(char *buffer, zfs_kernel_param_t *unused)
+{
+       const uint32_t impl = IMPL_READ(generic_impl_chosen);
+       char *fmt;
+       int cnt = 0;
+
+       /* cycling */
+       fmt = IMPL_FMT(impl, IMPL_CYCLE);
+       cnt += sprintf(buffer + cnt, fmt, "cycle");
+
+       /* list fastest */
+       fmt = IMPL_FMT(impl, IMPL_FASTEST);
+       cnt += sprintf(buffer + cnt, fmt, "fastest");
+
+       /* list all supported implementations */
+       generic_impl_init();
+       for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) {
+               fmt = IMPL_FMT(impl, i);
+               cnt += sprintf(buffer + cnt, fmt,
+                   generic_supp_impls[i]->name);
+       }
+
+       return (cnt);
+}
+
+static int
+sha512_param_set(const char *val, zfs_kernel_param_t *unused)
+{
+       (void) unused;
+       return (generic_impl_setname(val));
+}
+
+#elif defined(__FreeBSD__)
+
+#include <sys/sbuf.h>
+
+static int
+sha512_param(ZFS_MODULE_PARAM_ARGS)
+{
+       int err;
+
+       generic_impl_init();
+       if (req->newptr == NULL) {
+               const uint32_t impl = IMPL_READ(generic_impl_chosen);
+               const int init_buflen = 64;
+               const char *fmt;
+               struct sbuf *s;
+
+               s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req);
+
+               /* cycling */
+               fmt = IMPL_FMT(impl, IMPL_CYCLE);
+               (void) sbuf_printf(s, fmt, "cycle");
+
+               /* list fastest */
+               fmt = IMPL_FMT(impl, IMPL_FASTEST);
+               (void) sbuf_printf(s, fmt, "fastest");
+
+               /* list all supported implementations */
+               for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) {
+                       fmt = IMPL_FMT(impl, i);
+                       (void) sbuf_printf(s, fmt, generic_supp_impls[i]->name);
+               }
+
+               err = sbuf_finish(s);
+               sbuf_delete(s);
+
+               return (err);
+       }
+
+       /* a new value was written to the module parameter */
+       char buf[16];
+
+       err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
+       if (err) {
+               return (err);
+       }
+
+       return (-generic_impl_setname(buf));
+}
+#endif
+
+#undef IMPL_FMT
+
+ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, sha512_impl,
+    sha512_param_set, sha512_param_get, ZMOD_RW, \
+       "Select SHA512 implementation.");
+#endif
+
+#undef TF
diff --git a/module/icp/asm-aarch64/sha2/sha256-armv8.S b/module/icp/asm-aarch64/sha2/sha256-armv8.S
new file mode 100644 (file)
index 0000000..fa50c4e
--- /dev/null
@@ -0,0 +1,1999 @@
+/*
+ * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Portions Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ * - modified assembly to fit into OpenZFS
+ */
+
+#if defined(__aarch64__)
+
+.text
+
+.align 6
+.type  .LK256,%object
+.LK256:
+       .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+       .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+       .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+       .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+       .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+       .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+       .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+       .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+       .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+       .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+       .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+       .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+       .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+       .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+       .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+       .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+       .long   0       //terminator
+.size  .LK256,.-.LK256
+
+.globl zfs_sha256_block_armv7
+.type  zfs_sha256_block_armv7,%function
+.align 6
+zfs_sha256_block_armv7:
+       stp     x29,x30,[sp,#-128]!
+       add     x29,sp,#0
+
+       stp     x19,x20,[sp,#16]
+       stp     x21,x22,[sp,#32]
+       stp     x23,x24,[sp,#48]
+       stp     x25,x26,[sp,#64]
+       stp     x27,x28,[sp,#80]
+       sub     sp,sp,#4*4
+
+       ldp     w20,w21,[x0]                            // load context
+       ldp     w22,w23,[x0,#2*4]
+       ldp     w24,w25,[x0,#4*4]
+       add     x2,x1,x2,lsl#6  // end of input
+       ldp     w26,w27,[x0,#6*4]
+       adr     x30,.LK256
+       stp     x0,x2,[x29,#96]
+
+.Loop:
+       ldp     w3,w4,[x1],#2*4
+       ldr     w19,[x30],#4                    // *K++
+       eor     w28,w21,w22                             // magic seed
+       str     x1,[x29,#112]
+#ifndef        __AARCH64EB__
+       rev     w3,w3                   // 0
+#endif
+       ror     w16,w24,#6
+       add     w27,w27,w19                     // h+=K[i]
+       eor     w6,w24,w24,ror#14
+       and     w17,w25,w24
+       bic     w19,w26,w24
+       add     w27,w27,w3                      // h+=X[i]
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w20,w21                     // a^b, b^c in next round
+       eor     w16,w16,w6,ror#11       // Sigma1(e)
+       ror     w6,w20,#2
+       add     w27,w27,w17                     // h+=Ch(e,f,g)
+       eor     w17,w20,w20,ror#9
+       add     w27,w27,w16                     // h+=Sigma1(e)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       add     w23,w23,w27                     // d+=h
+       eor     w28,w28,w21                     // Maj(a,b,c)
+       eor     w17,w6,w17,ror#13       // Sigma0(a)
+       add     w27,w27,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       //add   w27,w27,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w4,w4                   // 1
+#endif
+       ldp     w5,w6,[x1],#2*4
+       add     w27,w27,w17                     // h+=Sigma0(a)
+       ror     w16,w23,#6
+       add     w26,w26,w28                     // h+=K[i]
+       eor     w7,w23,w23,ror#14
+       and     w17,w24,w23
+       bic     w28,w25,w23
+       add     w26,w26,w4                      // h+=X[i]
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w27,w20                     // a^b, b^c in next round
+       eor     w16,w16,w7,ror#11       // Sigma1(e)
+       ror     w7,w27,#2
+       add     w26,w26,w17                     // h+=Ch(e,f,g)
+       eor     w17,w27,w27,ror#9
+       add     w26,w26,w16                     // h+=Sigma1(e)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       add     w22,w22,w26                     // d+=h
+       eor     w19,w19,w20                     // Maj(a,b,c)
+       eor     w17,w7,w17,ror#13       // Sigma0(a)
+       add     w26,w26,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       //add   w26,w26,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w5,w5                   // 2
+#endif
+       add     w26,w26,w17                     // h+=Sigma0(a)
+       ror     w16,w22,#6
+       add     w25,w25,w19                     // h+=K[i]
+       eor     w8,w22,w22,ror#14
+       and     w17,w23,w22
+       bic     w19,w24,w22
+       add     w25,w25,w5                      // h+=X[i]
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w26,w27                     // a^b, b^c in next round
+       eor     w16,w16,w8,ror#11       // Sigma1(e)
+       ror     w8,w26,#2
+       add     w25,w25,w17                     // h+=Ch(e,f,g)
+       eor     w17,w26,w26,ror#9
+       add     w25,w25,w16                     // h+=Sigma1(e)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       add     w21,w21,w25                     // d+=h
+       eor     w28,w28,w27                     // Maj(a,b,c)
+       eor     w17,w8,w17,ror#13       // Sigma0(a)
+       add     w25,w25,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       //add   w25,w25,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w6,w6                   // 3
+#endif
+       ldp     w7,w8,[x1],#2*4
+       add     w25,w25,w17                     // h+=Sigma0(a)
+       ror     w16,w21,#6
+       add     w24,w24,w28                     // h+=K[i]
+       eor     w9,w21,w21,ror#14
+       and     w17,w22,w21
+       bic     w28,w23,w21
+       add     w24,w24,w6                      // h+=X[i]
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w25,w26                     // a^b, b^c in next round
+       eor     w16,w16,w9,ror#11       // Sigma1(e)
+       ror     w9,w25,#2
+       add     w24,w24,w17                     // h+=Ch(e,f,g)
+       eor     w17,w25,w25,ror#9
+       add     w24,w24,w16                     // h+=Sigma1(e)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       add     w20,w20,w24                     // d+=h
+       eor     w19,w19,w26                     // Maj(a,b,c)
+       eor     w17,w9,w17,ror#13       // Sigma0(a)
+       add     w24,w24,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       //add   w24,w24,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w7,w7                   // 4
+#endif
+       add     w24,w24,w17                     // h+=Sigma0(a)
+       ror     w16,w20,#6
+       add     w23,w23,w19                     // h+=K[i]
+       eor     w10,w20,w20,ror#14
+       and     w17,w21,w20
+       bic     w19,w22,w20
+       add     w23,w23,w7                      // h+=X[i]
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w24,w25                     // a^b, b^c in next round
+       eor     w16,w16,w10,ror#11      // Sigma1(e)
+       ror     w10,w24,#2
+       add     w23,w23,w17                     // h+=Ch(e,f,g)
+       eor     w17,w24,w24,ror#9
+       add     w23,w23,w16                     // h+=Sigma1(e)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       add     w27,w27,w23                     // d+=h
+       eor     w28,w28,w25                     // Maj(a,b,c)
+       eor     w17,w10,w17,ror#13      // Sigma0(a)
+       add     w23,w23,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       //add   w23,w23,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w8,w8                   // 5
+#endif
+       ldp     w9,w10,[x1],#2*4
+       add     w23,w23,w17                     // h+=Sigma0(a)
+       ror     w16,w27,#6
+       add     w22,w22,w28                     // h+=K[i]
+       eor     w11,w27,w27,ror#14
+       and     w17,w20,w27
+       bic     w28,w21,w27
+       add     w22,w22,w8                      // h+=X[i]
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w23,w24                     // a^b, b^c in next round
+       eor     w16,w16,w11,ror#11      // Sigma1(e)
+       ror     w11,w23,#2
+       add     w22,w22,w17                     // h+=Ch(e,f,g)
+       eor     w17,w23,w23,ror#9
+       add     w22,w22,w16                     // h+=Sigma1(e)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       add     w26,w26,w22                     // d+=h
+       eor     w19,w19,w24                     // Maj(a,b,c)
+       eor     w17,w11,w17,ror#13      // Sigma0(a)
+       add     w22,w22,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       //add   w22,w22,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w9,w9                   // 6
+#endif
+       add     w22,w22,w17                     // h+=Sigma0(a)
+       ror     w16,w26,#6
+       add     w21,w21,w19                     // h+=K[i]
+       eor     w12,w26,w26,ror#14
+       and     w17,w27,w26
+       bic     w19,w20,w26
+       add     w21,w21,w9                      // h+=X[i]
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w22,w23                     // a^b, b^c in next round
+       eor     w16,w16,w12,ror#11      // Sigma1(e)
+       ror     w12,w22,#2
+       add     w21,w21,w17                     // h+=Ch(e,f,g)
+       eor     w17,w22,w22,ror#9
+       add     w21,w21,w16                     // h+=Sigma1(e)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       add     w25,w25,w21                     // d+=h
+       eor     w28,w28,w23                     // Maj(a,b,c)
+       eor     w17,w12,w17,ror#13      // Sigma0(a)
+       add     w21,w21,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       //add   w21,w21,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w10,w10                 // 7
+#endif
+       ldp     w11,w12,[x1],#2*4
+       add     w21,w21,w17                     // h+=Sigma0(a)
+       ror     w16,w25,#6
+       add     w20,w20,w28                     // h+=K[i]
+       eor     w13,w25,w25,ror#14
+       and     w17,w26,w25
+       bic     w28,w27,w25
+       add     w20,w20,w10                     // h+=X[i]
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w21,w22                     // a^b, b^c in next round
+       eor     w16,w16,w13,ror#11      // Sigma1(e)
+       ror     w13,w21,#2
+       add     w20,w20,w17                     // h+=Ch(e,f,g)
+       eor     w17,w21,w21,ror#9
+       add     w20,w20,w16                     // h+=Sigma1(e)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       add     w24,w24,w20                     // d+=h
+       eor     w19,w19,w22                     // Maj(a,b,c)
+       eor     w17,w13,w17,ror#13      // Sigma0(a)
+       add     w20,w20,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       //add   w20,w20,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w11,w11                 // 8
+#endif
+       add     w20,w20,w17                     // h+=Sigma0(a)
+       ror     w16,w24,#6
+       add     w27,w27,w19                     // h+=K[i]
+       eor     w14,w24,w24,ror#14
+       and     w17,w25,w24
+       bic     w19,w26,w24
+       add     w27,w27,w11                     // h+=X[i]
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w20,w21                     // a^b, b^c in next round
+       eor     w16,w16,w14,ror#11      // Sigma1(e)
+       ror     w14,w20,#2
+       add     w27,w27,w17                     // h+=Ch(e,f,g)
+       eor     w17,w20,w20,ror#9
+       add     w27,w27,w16                     // h+=Sigma1(e)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       add     w23,w23,w27                     // d+=h
+       eor     w28,w28,w21                     // Maj(a,b,c)
+       eor     w17,w14,w17,ror#13      // Sigma0(a)
+       add     w27,w27,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       //add   w27,w27,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w12,w12                 // 9
+#endif
+       ldp     w13,w14,[x1],#2*4
+       add     w27,w27,w17                     // h+=Sigma0(a)
+       ror     w16,w23,#6
+       add     w26,w26,w28                     // h+=K[i]
+       eor     w15,w23,w23,ror#14
+       and     w17,w24,w23
+       bic     w28,w25,w23
+       add     w26,w26,w12                     // h+=X[i]
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w27,w20                     // a^b, b^c in next round
+       eor     w16,w16,w15,ror#11      // Sigma1(e)
+       ror     w15,w27,#2
+       add     w26,w26,w17                     // h+=Ch(e,f,g)
+       eor     w17,w27,w27,ror#9
+       add     w26,w26,w16                     // h+=Sigma1(e)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       add     w22,w22,w26                     // d+=h
+       eor     w19,w19,w20                     // Maj(a,b,c)
+       eor     w17,w15,w17,ror#13      // Sigma0(a)
+       add     w26,w26,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       //add   w26,w26,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w13,w13                 // 10
+#endif
+       add     w26,w26,w17                     // h+=Sigma0(a)
+       ror     w16,w22,#6
+       add     w25,w25,w19                     // h+=K[i]
+       eor     w0,w22,w22,ror#14
+       and     w17,w23,w22
+       bic     w19,w24,w22
+       add     w25,w25,w13                     // h+=X[i]
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w26,w27                     // a^b, b^c in next round
+       eor     w16,w16,w0,ror#11       // Sigma1(e)
+       ror     w0,w26,#2
+       add     w25,w25,w17                     // h+=Ch(e,f,g)
+       eor     w17,w26,w26,ror#9
+       add     w25,w25,w16                     // h+=Sigma1(e)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       add     w21,w21,w25                     // d+=h
+       eor     w28,w28,w27                     // Maj(a,b,c)
+       eor     w17,w0,w17,ror#13       // Sigma0(a)
+       add     w25,w25,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       //add   w25,w25,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w14,w14                 // 11
+#endif
+       ldp     w15,w0,[x1],#2*4
+       add     w25,w25,w17                     // h+=Sigma0(a)
+       str     w6,[sp,#12]
+       ror     w16,w21,#6
+       add     w24,w24,w28                     // h+=K[i]
+       eor     w6,w21,w21,ror#14
+       and     w17,w22,w21
+       bic     w28,w23,w21
+       add     w24,w24,w14                     // h+=X[i]
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w25,w26                     // a^b, b^c in next round
+       eor     w16,w16,w6,ror#11       // Sigma1(e)
+       ror     w6,w25,#2
+       add     w24,w24,w17                     // h+=Ch(e,f,g)
+       eor     w17,w25,w25,ror#9
+       add     w24,w24,w16                     // h+=Sigma1(e)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       add     w20,w20,w24                     // d+=h
+       eor     w19,w19,w26                     // Maj(a,b,c)
+       eor     w17,w6,w17,ror#13       // Sigma0(a)
+       add     w24,w24,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       //add   w24,w24,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w15,w15                 // 12
+#endif
+       add     w24,w24,w17                     // h+=Sigma0(a)
+       str     w7,[sp,#0]
+       ror     w16,w20,#6
+       add     w23,w23,w19                     // h+=K[i]
+       eor     w7,w20,w20,ror#14
+       and     w17,w21,w20
+       bic     w19,w22,w20
+       add     w23,w23,w15                     // h+=X[i]
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w24,w25                     // a^b, b^c in next round
+       eor     w16,w16,w7,ror#11       // Sigma1(e)
+       ror     w7,w24,#2
+       add     w23,w23,w17                     // h+=Ch(e,f,g)
+       eor     w17,w24,w24,ror#9
+       add     w23,w23,w16                     // h+=Sigma1(e)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       add     w27,w27,w23                     // d+=h
+       eor     w28,w28,w25                     // Maj(a,b,c)
+       eor     w17,w7,w17,ror#13       // Sigma0(a)
+       add     w23,w23,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       //add   w23,w23,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w0,w0                   // 13
+#endif
+       ldp     w1,w2,[x1]
+       add     w23,w23,w17                     // h+=Sigma0(a)
+       str     w8,[sp,#4]
+       ror     w16,w27,#6
+       add     w22,w22,w28                     // h+=K[i]
+       eor     w8,w27,w27,ror#14
+       and     w17,w20,w27
+       bic     w28,w21,w27
+       add     w22,w22,w0                      // h+=X[i]
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w23,w24                     // a^b, b^c in next round
+       eor     w16,w16,w8,ror#11       // Sigma1(e)
+       ror     w8,w23,#2
+       add     w22,w22,w17                     // h+=Ch(e,f,g)
+       eor     w17,w23,w23,ror#9
+       add     w22,w22,w16                     // h+=Sigma1(e)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       add     w26,w26,w22                     // d+=h
+       eor     w19,w19,w24                     // Maj(a,b,c)
+       eor     w17,w8,w17,ror#13       // Sigma0(a)
+       add     w22,w22,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       //add   w22,w22,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w1,w1                   // 14
+#endif
+       ldr     w6,[sp,#12]
+       add     w22,w22,w17                     // h+=Sigma0(a)
+       str     w9,[sp,#8]
+       ror     w16,w26,#6
+       add     w21,w21,w19                     // h+=K[i]
+       eor     w9,w26,w26,ror#14
+       and     w17,w27,w26
+       bic     w19,w20,w26
+       add     w21,w21,w1                      // h+=X[i]
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w22,w23                     // a^b, b^c in next round
+       eor     w16,w16,w9,ror#11       // Sigma1(e)
+       ror     w9,w22,#2
+       add     w21,w21,w17                     // h+=Ch(e,f,g)
+       eor     w17,w22,w22,ror#9
+       add     w21,w21,w16                     // h+=Sigma1(e)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       add     w25,w25,w21                     // d+=h
+       eor     w28,w28,w23                     // Maj(a,b,c)
+       eor     w17,w9,w17,ror#13       // Sigma0(a)
+       add     w21,w21,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       //add   w21,w21,w17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     w2,w2                   // 15
+#endif
+       ldr     w7,[sp,#0]
+       add     w21,w21,w17                     // h+=Sigma0(a)
+       str     w10,[sp,#12]
+       ror     w16,w25,#6
+       add     w20,w20,w28                     // h+=K[i]
+       ror     w9,w4,#7
+       and     w17,w26,w25
+       ror     w8,w1,#17
+       bic     w28,w27,w25
+       ror     w10,w21,#2
+       add     w20,w20,w2                      // h+=X[i]
+       eor     w16,w16,w25,ror#11
+       eor     w9,w9,w4,ror#18
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w21,w22                     // a^b, b^c in next round
+       eor     w16,w16,w25,ror#25      // Sigma1(e)
+       eor     w10,w10,w21,ror#13
+       add     w20,w20,w17                     // h+=Ch(e,f,g)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       eor     w8,w8,w1,ror#19
+       eor     w9,w9,w4,lsr#3  // sigma0(X[i+1])
+       add     w20,w20,w16                     // h+=Sigma1(e)
+       eor     w19,w19,w22                     // Maj(a,b,c)
+       eor     w17,w10,w21,ror#22      // Sigma0(a)
+       eor     w8,w8,w1,lsr#10 // sigma1(X[i+14])
+       add     w3,w3,w12
+       add     w24,w24,w20                     // d+=h
+       add     w20,w20,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       add     w3,w3,w9
+       add     w20,w20,w17                     // h+=Sigma0(a)
+       add     w3,w3,w8
+.Loop_16_xx:
+       ldr     w8,[sp,#4]
+       str     w11,[sp,#0]
+       ror     w16,w24,#6
+       add     w27,w27,w19                     // h+=K[i]
+       ror     w10,w5,#7
+       and     w17,w25,w24
+       ror     w9,w2,#17
+       bic     w19,w26,w24
+       ror     w11,w20,#2
+       add     w27,w27,w3                      // h+=X[i]
+       eor     w16,w16,w24,ror#11
+       eor     w10,w10,w5,ror#18
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w20,w21                     // a^b, b^c in next round
+       eor     w16,w16,w24,ror#25      // Sigma1(e)
+       eor     w11,w11,w20,ror#13
+       add     w27,w27,w17                     // h+=Ch(e,f,g)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       eor     w9,w9,w2,ror#19
+       eor     w10,w10,w5,lsr#3        // sigma0(X[i+1])
+       add     w27,w27,w16                     // h+=Sigma1(e)
+       eor     w28,w28,w21                     // Maj(a,b,c)
+       eor     w17,w11,w20,ror#22      // Sigma0(a)
+       eor     w9,w9,w2,lsr#10 // sigma1(X[i+14])
+       add     w4,w4,w13
+       add     w23,w23,w27                     // d+=h
+       add     w27,w27,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       add     w4,w4,w10
+       add     w27,w27,w17                     // h+=Sigma0(a)
+       add     w4,w4,w9
+       ldr     w9,[sp,#8]
+       str     w12,[sp,#4]
+       ror     w16,w23,#6
+       add     w26,w26,w28                     // h+=K[i]
+       ror     w11,w6,#7
+       and     w17,w24,w23
+       ror     w10,w3,#17
+       bic     w28,w25,w23
+       ror     w12,w27,#2
+       add     w26,w26,w4                      // h+=X[i]
+       eor     w16,w16,w23,ror#11
+       eor     w11,w11,w6,ror#18
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w27,w20                     // a^b, b^c in next round
+       eor     w16,w16,w23,ror#25      // Sigma1(e)
+       eor     w12,w12,w27,ror#13
+       add     w26,w26,w17                     // h+=Ch(e,f,g)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       eor     w10,w10,w3,ror#19
+       eor     w11,w11,w6,lsr#3        // sigma0(X[i+1])
+       add     w26,w26,w16                     // h+=Sigma1(e)
+       eor     w19,w19,w20                     // Maj(a,b,c)
+       eor     w17,w12,w27,ror#22      // Sigma0(a)
+       eor     w10,w10,w3,lsr#10       // sigma1(X[i+14])
+       add     w5,w5,w14
+       add     w22,w22,w26                     // d+=h
+       add     w26,w26,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       add     w5,w5,w11
+       add     w26,w26,w17                     // h+=Sigma0(a)
+       add     w5,w5,w10
+       ldr     w10,[sp,#12]
+       str     w13,[sp,#8]
+       ror     w16,w22,#6
+       add     w25,w25,w19                     // h+=K[i]
+       ror     w12,w7,#7
+       and     w17,w23,w22
+       ror     w11,w4,#17
+       bic     w19,w24,w22
+       ror     w13,w26,#2
+       add     w25,w25,w5                      // h+=X[i]
+       eor     w16,w16,w22,ror#11
+       eor     w12,w12,w7,ror#18
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w26,w27                     // a^b, b^c in next round
+       eor     w16,w16,w22,ror#25      // Sigma1(e)
+       eor     w13,w13,w26,ror#13
+       add     w25,w25,w17                     // h+=Ch(e,f,g)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       eor     w11,w11,w4,ror#19
+       eor     w12,w12,w7,lsr#3        // sigma0(X[i+1])
+       add     w25,w25,w16                     // h+=Sigma1(e)
+       eor     w28,w28,w27                     // Maj(a,b,c)
+       eor     w17,w13,w26,ror#22      // Sigma0(a)
+       eor     w11,w11,w4,lsr#10       // sigma1(X[i+14])
+       add     w6,w6,w15
+       add     w21,w21,w25                     // d+=h
+       add     w25,w25,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       add     w6,w6,w12
+       add     w25,w25,w17                     // h+=Sigma0(a)
+       add     w6,w6,w11
+       ldr     w11,[sp,#0]
+       str     w14,[sp,#12]
+       ror     w16,w21,#6
+       add     w24,w24,w28                     // h+=K[i]
+       ror     w13,w8,#7
+       and     w17,w22,w21
+       ror     w12,w5,#17
+       bic     w28,w23,w21
+       ror     w14,w25,#2
+       add     w24,w24,w6                      // h+=X[i]
+       eor     w16,w16,w21,ror#11
+       eor     w13,w13,w8,ror#18
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w25,w26                     // a^b, b^c in next round
+       eor     w16,w16,w21,ror#25      // Sigma1(e)
+       eor     w14,w14,w25,ror#13
+       add     w24,w24,w17                     // h+=Ch(e,f,g)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       eor     w12,w12,w5,ror#19
+       eor     w13,w13,w8,lsr#3        // sigma0(X[i+1])
+       add     w24,w24,w16                     // h+=Sigma1(e)
+       eor     w19,w19,w26                     // Maj(a,b,c)
+       eor     w17,w14,w25,ror#22      // Sigma0(a)
+       eor     w12,w12,w5,lsr#10       // sigma1(X[i+14])
+       add     w7,w7,w0
+       add     w20,w20,w24                     // d+=h
+       add     w24,w24,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       add     w7,w7,w13
+       add     w24,w24,w17                     // h+=Sigma0(a)
+       add     w7,w7,w12
+       ldr     w12,[sp,#4]
+       str     w15,[sp,#0]
+       ror     w16,w20,#6
+       add     w23,w23,w19                     // h+=K[i]
+       ror     w14,w9,#7
+       and     w17,w21,w20
+       ror     w13,w6,#17
+       bic     w19,w22,w20
+       ror     w15,w24,#2
+       add     w23,w23,w7                      // h+=X[i]
+       eor     w16,w16,w20,ror#11
+       eor     w14,w14,w9,ror#18
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w24,w25                     // a^b, b^c in next round
+       eor     w16,w16,w20,ror#25      // Sigma1(e)
+       eor     w15,w15,w24,ror#13
+       add     w23,w23,w17                     // h+=Ch(e,f,g)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       eor     w13,w13,w6,ror#19
+       eor     w14,w14,w9,lsr#3        // sigma0(X[i+1])
+       add     w23,w23,w16                     // h+=Sigma1(e)
+       eor     w28,w28,w25                     // Maj(a,b,c)
+       eor     w17,w15,w24,ror#22      // Sigma0(a)
+       eor     w13,w13,w6,lsr#10       // sigma1(X[i+14])
+       add     w8,w8,w1
+       add     w27,w27,w23                     // d+=h
+       add     w23,w23,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       add     w8,w8,w14
+       add     w23,w23,w17                     // h+=Sigma0(a)
+       add     w8,w8,w13
+       ldr     w13,[sp,#8]
+       str     w0,[sp,#4]
+       ror     w16,w27,#6
+       add     w22,w22,w28                     // h+=K[i]
+       ror     w15,w10,#7
+       and     w17,w20,w27
+       ror     w14,w7,#17
+       bic     w28,w21,w27
+       ror     w0,w23,#2
+       add     w22,w22,w8                      // h+=X[i]
+       eor     w16,w16,w27,ror#11
+       eor     w15,w15,w10,ror#18
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w23,w24                     // a^b, b^c in next round
+       eor     w16,w16,w27,ror#25      // Sigma1(e)
+       eor     w0,w0,w23,ror#13
+       add     w22,w22,w17                     // h+=Ch(e,f,g)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       eor     w14,w14,w7,ror#19
+       eor     w15,w15,w10,lsr#3       // sigma0(X[i+1])
+       add     w22,w22,w16                     // h+=Sigma1(e)
+       eor     w19,w19,w24                     // Maj(a,b,c)
+       eor     w17,w0,w23,ror#22       // Sigma0(a)
+       eor     w14,w14,w7,lsr#10       // sigma1(X[i+14])
+       add     w9,w9,w2
+       add     w26,w26,w22                     // d+=h
+       add     w22,w22,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       add     w9,w9,w15
+       add     w22,w22,w17                     // h+=Sigma0(a)
+       add     w9,w9,w14
+       ldr     w14,[sp,#12]
+       str     w1,[sp,#8]
+       ror     w16,w26,#6
+       add     w21,w21,w19                     // h+=K[i]
+       ror     w0,w11,#7
+       and     w17,w27,w26
+       ror     w15,w8,#17
+       bic     w19,w20,w26
+       ror     w1,w22,#2
+       add     w21,w21,w9                      // h+=X[i]
+       eor     w16,w16,w26,ror#11
+       eor     w0,w0,w11,ror#18
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w22,w23                     // a^b, b^c in next round
+       eor     w16,w16,w26,ror#25      // Sigma1(e)
+       eor     w1,w1,w22,ror#13
+       add     w21,w21,w17                     // h+=Ch(e,f,g)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       eor     w15,w15,w8,ror#19
+       eor     w0,w0,w11,lsr#3 // sigma0(X[i+1])
+       add     w21,w21,w16                     // h+=Sigma1(e)
+       eor     w28,w28,w23                     // Maj(a,b,c)
+       eor     w17,w1,w22,ror#22       // Sigma0(a)
+       eor     w15,w15,w8,lsr#10       // sigma1(X[i+14])
+       add     w10,w10,w3
+       add     w25,w25,w21                     // d+=h
+       add     w21,w21,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       add     w10,w10,w0
+       add     w21,w21,w17                     // h+=Sigma0(a)
+       add     w10,w10,w15
+       ldr     w15,[sp,#0]
+       str     w2,[sp,#12]
+       ror     w16,w25,#6
+       add     w20,w20,w28                     // h+=K[i]
+       ror     w1,w12,#7
+       and     w17,w26,w25
+       ror     w0,w9,#17
+       bic     w28,w27,w25
+       ror     w2,w21,#2
+       add     w20,w20,w10                     // h+=X[i]
+       eor     w16,w16,w25,ror#11
+       eor     w1,w1,w12,ror#18
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w21,w22                     // a^b, b^c in next round
+       eor     w16,w16,w25,ror#25      // Sigma1(e)
+       eor     w2,w2,w21,ror#13
+       add     w20,w20,w17                     // h+=Ch(e,f,g)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       eor     w0,w0,w9,ror#19
+       eor     w1,w1,w12,lsr#3 // sigma0(X[i+1])
+       add     w20,w20,w16                     // h+=Sigma1(e)
+       eor     w19,w19,w22                     // Maj(a,b,c)
+       eor     w17,w2,w21,ror#22       // Sigma0(a)
+       eor     w0,w0,w9,lsr#10 // sigma1(X[i+14])
+       add     w11,w11,w4
+       add     w24,w24,w20                     // d+=h
+       add     w20,w20,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       add     w11,w11,w1
+       add     w20,w20,w17                     // h+=Sigma0(a)
+       add     w11,w11,w0
+       ldr     w0,[sp,#4]
+       str     w3,[sp,#0]
+       ror     w16,w24,#6
+       add     w27,w27,w19                     // h+=K[i]
+       ror     w2,w13,#7
+       and     w17,w25,w24
+       ror     w1,w10,#17
+       bic     w19,w26,w24
+       ror     w3,w20,#2
+       add     w27,w27,w11                     // h+=X[i]
+       eor     w16,w16,w24,ror#11
+       eor     w2,w2,w13,ror#18
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w20,w21                     // a^b, b^c in next round
+       eor     w16,w16,w24,ror#25      // Sigma1(e)
+       eor     w3,w3,w20,ror#13
+       add     w27,w27,w17                     // h+=Ch(e,f,g)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       eor     w1,w1,w10,ror#19
+       eor     w2,w2,w13,lsr#3 // sigma0(X[i+1])
+       add     w27,w27,w16                     // h+=Sigma1(e)
+       eor     w28,w28,w21                     // Maj(a,b,c)
+       eor     w17,w3,w20,ror#22       // Sigma0(a)
+       eor     w1,w1,w10,lsr#10        // sigma1(X[i+14])
+       add     w12,w12,w5
+       add     w23,w23,w27                     // d+=h
+       add     w27,w27,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       add     w12,w12,w2
+       add     w27,w27,w17                     // h+=Sigma0(a)
+       add     w12,w12,w1
+       ldr     w1,[sp,#8]
+       str     w4,[sp,#4]
+       ror     w16,w23,#6
+       add     w26,w26,w28                     // h+=K[i]
+       ror     w3,w14,#7
+       and     w17,w24,w23
+       ror     w2,w11,#17
+       bic     w28,w25,w23
+       ror     w4,w27,#2
+       add     w26,w26,w12                     // h+=X[i]
+       eor     w16,w16,w23,ror#11
+       eor     w3,w3,w14,ror#18
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w27,w20                     // a^b, b^c in next round
+       eor     w16,w16,w23,ror#25      // Sigma1(e)
+       eor     w4,w4,w27,ror#13
+       add     w26,w26,w17                     // h+=Ch(e,f,g)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       eor     w2,w2,w11,ror#19
+       eor     w3,w3,w14,lsr#3 // sigma0(X[i+1])
+       add     w26,w26,w16                     // h+=Sigma1(e)
+       eor     w19,w19,w20                     // Maj(a,b,c)
+       eor     w17,w4,w27,ror#22       // Sigma0(a)
+       eor     w2,w2,w11,lsr#10        // sigma1(X[i+14])
+       add     w13,w13,w6
+       add     w22,w22,w26                     // d+=h
+       add     w26,w26,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       add     w13,w13,w3
+       add     w26,w26,w17                     // h+=Sigma0(a)
+       add     w13,w13,w2
+       ldr     w2,[sp,#12]
+       str     w5,[sp,#8]
+       ror     w16,w22,#6
+       add     w25,w25,w19                     // h+=K[i]
+       ror     w4,w15,#7
+       and     w17,w23,w22
+       ror     w3,w12,#17
+       bic     w19,w24,w22
+       ror     w5,w26,#2
+       add     w25,w25,w13                     // h+=X[i]
+       eor     w16,w16,w22,ror#11
+       eor     w4,w4,w15,ror#18
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w26,w27                     // a^b, b^c in next round
+       eor     w16,w16,w22,ror#25      // Sigma1(e)
+       eor     w5,w5,w26,ror#13
+       add     w25,w25,w17                     // h+=Ch(e,f,g)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       eor     w3,w3,w12,ror#19
+       eor     w4,w4,w15,lsr#3 // sigma0(X[i+1])
+       add     w25,w25,w16                     // h+=Sigma1(e)
+       eor     w28,w28,w27                     // Maj(a,b,c)
+       eor     w17,w5,w26,ror#22       // Sigma0(a)
+       eor     w3,w3,w12,lsr#10        // sigma1(X[i+14])
+       add     w14,w14,w7
+       add     w21,w21,w25                     // d+=h
+       add     w25,w25,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       add     w14,w14,w4
+       add     w25,w25,w17                     // h+=Sigma0(a)
+       add     w14,w14,w3
+       ldr     w3,[sp,#0]
+       str     w6,[sp,#12]
+       ror     w16,w21,#6
+       add     w24,w24,w28                     // h+=K[i]
+       ror     w5,w0,#7
+       and     w17,w22,w21
+       ror     w4,w13,#17
+       bic     w28,w23,w21
+       ror     w6,w25,#2
+       add     w24,w24,w14                     // h+=X[i]
+       eor     w16,w16,w21,ror#11
+       eor     w5,w5,w0,ror#18
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w25,w26                     // a^b, b^c in next round
+       eor     w16,w16,w21,ror#25      // Sigma1(e)
+       eor     w6,w6,w25,ror#13
+       add     w24,w24,w17                     // h+=Ch(e,f,g)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       eor     w4,w4,w13,ror#19
+       eor     w5,w5,w0,lsr#3  // sigma0(X[i+1])
+       add     w24,w24,w16                     // h+=Sigma1(e)
+       eor     w19,w19,w26                     // Maj(a,b,c)
+       eor     w17,w6,w25,ror#22       // Sigma0(a)
+       eor     w4,w4,w13,lsr#10        // sigma1(X[i+14])
+       add     w15,w15,w8
+       add     w20,w20,w24                     // d+=h
+       add     w24,w24,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       add     w15,w15,w5
+       add     w24,w24,w17                     // h+=Sigma0(a)
+       add     w15,w15,w4
+       ldr     w4,[sp,#4]
+       str     w7,[sp,#0]
+       ror     w16,w20,#6
+       add     w23,w23,w19                     // h+=K[i]
+       ror     w6,w1,#7
+       and     w17,w21,w20
+       ror     w5,w14,#17
+       bic     w19,w22,w20
+       ror     w7,w24,#2
+       add     w23,w23,w15                     // h+=X[i]
+       eor     w16,w16,w20,ror#11
+       eor     w6,w6,w1,ror#18
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w24,w25                     // a^b, b^c in next round
+       eor     w16,w16,w20,ror#25      // Sigma1(e)
+       eor     w7,w7,w24,ror#13
+       add     w23,w23,w17                     // h+=Ch(e,f,g)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       eor     w5,w5,w14,ror#19
+       eor     w6,w6,w1,lsr#3  // sigma0(X[i+1])
+       add     w23,w23,w16                     // h+=Sigma1(e)
+       eor     w28,w28,w25                     // Maj(a,b,c)
+       eor     w17,w7,w24,ror#22       // Sigma0(a)
+       eor     w5,w5,w14,lsr#10        // sigma1(X[i+14])
+       add     w0,w0,w9
+       add     w27,w27,w23                     // d+=h
+       add     w23,w23,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       add     w0,w0,w6
+       add     w23,w23,w17                     // h+=Sigma0(a)
+       add     w0,w0,w5
+       ldr     w5,[sp,#8]
+       str     w8,[sp,#4]
+       ror     w16,w27,#6
+       add     w22,w22,w28                     // h+=K[i]
+       ror     w7,w2,#7
+       and     w17,w20,w27
+       ror     w6,w15,#17
+       bic     w28,w21,w27
+       ror     w8,w23,#2
+       add     w22,w22,w0                      // h+=X[i]
+       eor     w16,w16,w27,ror#11
+       eor     w7,w7,w2,ror#18
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w23,w24                     // a^b, b^c in next round
+       eor     w16,w16,w27,ror#25      // Sigma1(e)
+       eor     w8,w8,w23,ror#13
+       add     w22,w22,w17                     // h+=Ch(e,f,g)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       eor     w6,w6,w15,ror#19
+       eor     w7,w7,w2,lsr#3  // sigma0(X[i+1])
+       add     w22,w22,w16                     // h+=Sigma1(e)
+       eor     w19,w19,w24                     // Maj(a,b,c)
+       eor     w17,w8,w23,ror#22       // Sigma0(a)
+       eor     w6,w6,w15,lsr#10        // sigma1(X[i+14])
+       add     w1,w1,w10
+       add     w26,w26,w22                     // d+=h
+       add     w22,w22,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       add     w1,w1,w7
+       add     w22,w22,w17                     // h+=Sigma0(a)
+       add     w1,w1,w6
+       ldr     w6,[sp,#12]
+       str     w9,[sp,#8]
+       ror     w16,w26,#6
+       add     w21,w21,w19                     // h+=K[i]
+       ror     w8,w3,#7
+       and     w17,w27,w26
+       ror     w7,w0,#17
+       bic     w19,w20,w26
+       ror     w9,w22,#2
+       add     w21,w21,w1                      // h+=X[i]
+       eor     w16,w16,w26,ror#11
+       eor     w8,w8,w3,ror#18
+       orr     w17,w17,w19                     // Ch(e,f,g)
+       eor     w19,w22,w23                     // a^b, b^c in next round
+       eor     w16,w16,w26,ror#25      // Sigma1(e)
+       eor     w9,w9,w22,ror#13
+       add     w21,w21,w17                     // h+=Ch(e,f,g)
+       and     w28,w28,w19                     // (b^c)&=(a^b)
+       eor     w7,w7,w0,ror#19
+       eor     w8,w8,w3,lsr#3  // sigma0(X[i+1])
+       add     w21,w21,w16                     // h+=Sigma1(e)
+       eor     w28,w28,w23                     // Maj(a,b,c)
+       eor     w17,w9,w22,ror#22       // Sigma0(a)
+       eor     w7,w7,w0,lsr#10 // sigma1(X[i+14])
+       add     w2,w2,w11
+       add     w25,w25,w21                     // d+=h
+       add     w21,w21,w28                     // h+=Maj(a,b,c)
+       ldr     w28,[x30],#4            // *K++, w19 in next round
+       add     w2,w2,w8
+       add     w21,w21,w17                     // h+=Sigma0(a)
+       add     w2,w2,w7
+       ldr     w7,[sp,#0]
+       str     w10,[sp,#12]
+       ror     w16,w25,#6
+       add     w20,w20,w28                     // h+=K[i]
+       ror     w9,w4,#7
+       and     w17,w26,w25
+       ror     w8,w1,#17
+       bic     w28,w27,w25
+       ror     w10,w21,#2
+       add     w20,w20,w2                      // h+=X[i]
+       eor     w16,w16,w25,ror#11
+       eor     w9,w9,w4,ror#18
+       orr     w17,w17,w28                     // Ch(e,f,g)
+       eor     w28,w21,w22                     // a^b, b^c in next round
+       eor     w16,w16,w25,ror#25      // Sigma1(e)
+       eor     w10,w10,w21,ror#13
+       add     w20,w20,w17                     // h+=Ch(e,f,g)
+       and     w19,w19,w28                     // (b^c)&=(a^b)
+       eor     w8,w8,w1,ror#19
+       eor     w9,w9,w4,lsr#3  // sigma0(X[i+1])
+       add     w20,w20,w16                     // h+=Sigma1(e)
+       eor     w19,w19,w22                     // Maj(a,b,c)
+       eor     w17,w10,w21,ror#22      // Sigma0(a)
+       eor     w8,w8,w1,lsr#10 // sigma1(X[i+14])
+       add     w3,w3,w12
+       add     w24,w24,w20                     // d+=h
+       add     w20,w20,w19                     // h+=Maj(a,b,c)
+       ldr     w19,[x30],#4            // *K++, w28 in next round
+       add     w3,w3,w9
+       add     w20,w20,w17                     // h+=Sigma0(a)
+       add     w3,w3,w8
+       cbnz    w19,.Loop_16_xx
+
+       ldp     x0,x2,[x29,#96]
+       ldr     x1,[x29,#112]
+       sub     x30,x30,#260            // rewind
+
+       ldp     w3,w4,[x0]
+       ldp     w5,w6,[x0,#2*4]
+       add     x1,x1,#14*4                     // advance input pointer
+       ldp     w7,w8,[x0,#4*4]
+       add     w20,w20,w3
+       ldp     w9,w10,[x0,#6*4]
+       add     w21,w21,w4
+       add     w22,w22,w5
+       add     w23,w23,w6
+       stp     w20,w21,[x0]
+       add     w24,w24,w7
+       add     w25,w25,w8
+       stp     w22,w23,[x0,#2*4]
+       add     w26,w26,w9
+       add     w27,w27,w10
+       cmp     x1,x2
+       stp     w24,w25,[x0,#4*4]
+       stp     w26,w27,[x0,#6*4]
+       b.ne    .Loop
+
+       ldp     x19,x20,[x29,#16]
+       add     sp,sp,#4*4
+       ldp     x21,x22,[x29,#32]
+       ldp     x23,x24,[x29,#48]
+       ldp     x25,x26,[x29,#64]
+       ldp     x27,x28,[x29,#80]
+       ldp     x29,x30,[sp],#128
+       ret
+.size  zfs_sha256_block_armv7,.-zfs_sha256_block_armv7
+
+.globl zfs_sha256_block_armv8
+.type  zfs_sha256_block_armv8,%function
+.align 6
+zfs_sha256_block_armv8:
+.Lv8_entry:
+       stp             x29,x30,[sp,#-16]!
+       add             x29,sp,#0
+
+       ld1             {v0.4s,v1.4s},[x0]
+       adr             x3,.LK256
+
+.Loop_hw:
+       ld1             {v4.16b-v7.16b},[x1],#64
+       sub             x2,x2,#1
+       ld1             {v16.4s},[x3],#16
+       rev32           v4.16b,v4.16b
+       rev32           v5.16b,v5.16b
+       rev32           v6.16b,v6.16b
+       rev32           v7.16b,v7.16b
+       orr             v18.16b,v0.16b,v0.16b           // offload
+       orr             v19.16b,v1.16b,v1.16b
+       ld1             {v17.4s},[x3],#16
+       add             v16.4s,v16.4s,v4.4s
+       .inst   0x5e2828a4      //sha256su0 v4.16b,v5.16b
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e104020      //sha256h v0.16b,v1.16b,v16.4s
+       .inst   0x5e105041      //sha256h2 v1.16b,v2.16b,v16.4s
+       .inst   0x5e0760c4      //sha256su1 v4.16b,v6.16b,v7.16b
+       ld1             {v16.4s},[x3],#16
+       add             v17.4s,v17.4s,v5.4s
+       .inst   0x5e2828c5      //sha256su0 v5.16b,v6.16b
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e114020      //sha256h v0.16b,v1.16b,v17.4s
+       .inst   0x5e115041      //sha256h2 v1.16b,v2.16b,v17.4s
+       .inst   0x5e0460e5      //sha256su1 v5.16b,v7.16b,v4.16b
+       ld1             {v17.4s},[x3],#16
+       add             v16.4s,v16.4s,v6.4s
+       .inst   0x5e2828e6      //sha256su0 v6.16b,v7.16b
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e104020      //sha256h v0.16b,v1.16b,v16.4s
+       .inst   0x5e105041      //sha256h2 v1.16b,v2.16b,v16.4s
+       .inst   0x5e056086      //sha256su1 v6.16b,v4.16b,v5.16b
+       ld1             {v16.4s},[x3],#16
+       add             v17.4s,v17.4s,v7.4s
+       .inst   0x5e282887      //sha256su0 v7.16b,v4.16b
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e114020      //sha256h v0.16b,v1.16b,v17.4s
+       .inst   0x5e115041      //sha256h2 v1.16b,v2.16b,v17.4s
+       .inst   0x5e0660a7      //sha256su1 v7.16b,v5.16b,v6.16b
+       ld1             {v17.4s},[x3],#16
+       add             v16.4s,v16.4s,v4.4s
+       .inst   0x5e2828a4      //sha256su0 v4.16b,v5.16b
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e104020      //sha256h v0.16b,v1.16b,v16.4s
+       .inst   0x5e105041      //sha256h2 v1.16b,v2.16b,v16.4s
+       .inst   0x5e0760c4      //sha256su1 v4.16b,v6.16b,v7.16b
+       ld1             {v16.4s},[x3],#16
+       add             v17.4s,v17.4s,v5.4s
+       .inst   0x5e2828c5      //sha256su0 v5.16b,v6.16b
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e114020      //sha256h v0.16b,v1.16b,v17.4s
+       .inst   0x5e115041      //sha256h2 v1.16b,v2.16b,v17.4s
+       .inst   0x5e0460e5      //sha256su1 v5.16b,v7.16b,v4.16b
+       ld1             {v17.4s},[x3],#16
+       add             v16.4s,v16.4s,v6.4s
+       .inst   0x5e2828e6      //sha256su0 v6.16b,v7.16b
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e104020      //sha256h v0.16b,v1.16b,v16.4s
+       .inst   0x5e105041      //sha256h2 v1.16b,v2.16b,v16.4s
+       .inst   0x5e056086      //sha256su1 v6.16b,v4.16b,v5.16b
+       ld1             {v16.4s},[x3],#16
+       add             v17.4s,v17.4s,v7.4s
+       .inst   0x5e282887      //sha256su0 v7.16b,v4.16b
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e114020      //sha256h v0.16b,v1.16b,v17.4s
+       .inst   0x5e115041      //sha256h2 v1.16b,v2.16b,v17.4s
+       .inst   0x5e0660a7      //sha256su1 v7.16b,v5.16b,v6.16b
+       ld1             {v17.4s},[x3],#16
+       add             v16.4s,v16.4s,v4.4s
+       .inst   0x5e2828a4      //sha256su0 v4.16b,v5.16b
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e104020      //sha256h v0.16b,v1.16b,v16.4s
+       .inst   0x5e105041      //sha256h2 v1.16b,v2.16b,v16.4s
+       .inst   0x5e0760c4      //sha256su1 v4.16b,v6.16b,v7.16b
+       ld1             {v16.4s},[x3],#16
+       add             v17.4s,v17.4s,v5.4s
+       .inst   0x5e2828c5      //sha256su0 v5.16b,v6.16b
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e114020      //sha256h v0.16b,v1.16b,v17.4s
+       .inst   0x5e115041      //sha256h2 v1.16b,v2.16b,v17.4s
+       .inst   0x5e0460e5      //sha256su1 v5.16b,v7.16b,v4.16b
+       ld1             {v17.4s},[x3],#16
+       add             v16.4s,v16.4s,v6.4s
+       .inst   0x5e2828e6      //sha256su0 v6.16b,v7.16b
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e104020      //sha256h v0.16b,v1.16b,v16.4s
+       .inst   0x5e105041      //sha256h2 v1.16b,v2.16b,v16.4s
+       .inst   0x5e056086      //sha256su1 v6.16b,v4.16b,v5.16b
+       ld1             {v16.4s},[x3],#16
+       add             v17.4s,v17.4s,v7.4s
+       .inst   0x5e282887      //sha256su0 v7.16b,v4.16b
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e114020      //sha256h v0.16b,v1.16b,v17.4s
+       .inst   0x5e115041      //sha256h2 v1.16b,v2.16b,v17.4s
+       .inst   0x5e0660a7      //sha256su1 v7.16b,v5.16b,v6.16b
+       ld1             {v17.4s},[x3],#16
+       add             v16.4s,v16.4s,v4.4s
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e104020      //sha256h v0.16b,v1.16b,v16.4s
+       .inst   0x5e105041      //sha256h2 v1.16b,v2.16b,v16.4s
+
+       ld1             {v16.4s},[x3],#16
+       add             v17.4s,v17.4s,v5.4s
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e114020      //sha256h v0.16b,v1.16b,v17.4s
+       .inst   0x5e115041      //sha256h2 v1.16b,v2.16b,v17.4s
+
+       ld1             {v17.4s},[x3]
+       add             v16.4s,v16.4s,v6.4s
+       sub             x3,x3,#64*4-16  // rewind
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e104020      //sha256h v0.16b,v1.16b,v16.4s
+       .inst   0x5e105041      //sha256h2 v1.16b,v2.16b,v16.4s
+
+       add             v17.4s,v17.4s,v7.4s
+       orr             v2.16b,v0.16b,v0.16b
+       .inst   0x5e114020      //sha256h v0.16b,v1.16b,v17.4s
+       .inst   0x5e115041      //sha256h2 v1.16b,v2.16b,v17.4s
+
+       add             v0.4s,v0.4s,v18.4s
+       add             v1.4s,v1.4s,v19.4s
+
+       cbnz            x2,.Loop_hw
+
+       st1             {v0.4s,v1.4s},[x0]
+
+       ldr             x29,[sp],#16
+       ret
+.size  zfs_sha256_block_armv8,.-zfs_sha256_block_armv8
+
+.globl zfs_sha256_block_neon
+.type  zfs_sha256_block_neon,%function
+.align 4
+zfs_sha256_block_neon:
+.Lneon_entry:
+       stp     x29, x30, [sp, #-16]!
+       mov     x29, sp
+       sub     sp,sp,#16*4
+
+       adr     x16,.LK256
+       add     x2,x1,x2,lsl#6  // len to point at the end of inp
+
+       ld1     {v0.16b},[x1], #16
+       ld1     {v1.16b},[x1], #16
+       ld1     {v2.16b},[x1], #16
+       ld1     {v3.16b},[x1], #16
+       ld1     {v4.4s},[x16], #16
+       ld1     {v5.4s},[x16], #16
+       ld1     {v6.4s},[x16], #16
+       ld1     {v7.4s},[x16], #16
+       rev32   v0.16b,v0.16b           // yes, even on
+       rev32   v1.16b,v1.16b           // big-endian
+       rev32   v2.16b,v2.16b
+       rev32   v3.16b,v3.16b
+       mov     x17,sp
+       add     v4.4s,v4.4s,v0.4s
+       add     v5.4s,v5.4s,v1.4s
+       add     v6.4s,v6.4s,v2.4s
+       st1     {v4.4s-v5.4s},[x17], #32
+       add     v7.4s,v7.4s,v3.4s
+       st1     {v6.4s-v7.4s},[x17]
+       sub     x17,x17,#32
+
+       ldp     w3,w4,[x0]
+       ldp     w5,w6,[x0,#8]
+       ldp     w7,w8,[x0,#16]
+       ldp     w9,w10,[x0,#24]
+       ldr     w12,[sp,#0]
+       mov     w13,wzr
+       eor     w14,w4,w5
+       mov     w15,wzr
+       b       .L_00_48
+
+.align 4
+.L_00_48:
+       ext     v4.16b,v0.16b,v1.16b,#4
+       add     w10,w10,w12
+       add     w3,w3,w15
+       and     w12,w8,w7
+       bic     w15,w9,w7
+       ext     v7.16b,v2.16b,v3.16b,#4
+       eor     w11,w7,w7,ror#5
+       add     w3,w3,w13
+       mov     d19,v3.d[1]
+       orr     w12,w12,w15
+       eor     w11,w11,w7,ror#19
+       ushr    v6.4s,v4.4s,#7
+       eor     w15,w3,w3,ror#11
+       ushr    v5.4s,v4.4s,#3
+       add     w10,w10,w12
+       add     v0.4s,v0.4s,v7.4s
+       ror     w11,w11,#6
+       sli     v6.4s,v4.4s,#25
+       eor     w13,w3,w4
+       eor     w15,w15,w3,ror#20
+       ushr    v7.4s,v4.4s,#18
+       add     w10,w10,w11
+       ldr     w12,[sp,#4]
+       and     w14,w14,w13
+       eor     v5.16b,v5.16b,v6.16b
+       ror     w15,w15,#2
+       add     w6,w6,w10
+       sli     v7.4s,v4.4s,#14
+       eor     w14,w14,w4
+       ushr    v16.4s,v19.4s,#17
+       add     w9,w9,w12
+       add     w10,w10,w15
+       and     w12,w7,w6
+       eor     v5.16b,v5.16b,v7.16b
+       bic     w15,w8,w6
+       eor     w11,w6,w6,ror#5
+       sli     v16.4s,v19.4s,#15
+       add     w10,w10,w14
+       orr     w12,w12,w15
+       ushr    v17.4s,v19.4s,#10
+       eor     w11,w11,w6,ror#19
+       eor     w15,w10,w10,ror#11
+       ushr    v7.4s,v19.4s,#19
+       add     w9,w9,w12
+       ror     w11,w11,#6
+       add     v0.4s,v0.4s,v5.4s
+       eor     w14,w10,w3
+       eor     w15,w15,w10,ror#20
+       sli     v7.4s,v19.4s,#13
+       add     w9,w9,w11
+       ldr     w12,[sp,#8]
+       and     w13,w13,w14
+       eor     v17.16b,v17.16b,v16.16b
+       ror     w15,w15,#2
+       add     w5,w5,w9
+       eor     w13,w13,w3
+       eor     v17.16b,v17.16b,v7.16b
+       add     w8,w8,w12
+       add     w9,w9,w15
+       and     w12,w6,w5
+       add     v0.4s,v0.4s,v17.4s
+       bic     w15,w7,w5
+       eor     w11,w5,w5,ror#5
+       add     w9,w9,w13
+       ushr    v18.4s,v0.4s,#17
+       orr     w12,w12,w15
+       ushr    v19.4s,v0.4s,#10
+       eor     w11,w11,w5,ror#19
+       eor     w15,w9,w9,ror#11
+       sli     v18.4s,v0.4s,#15
+       add     w8,w8,w12
+       ushr    v17.4s,v0.4s,#19
+       ror     w11,w11,#6
+       eor     w13,w9,w10
+       eor     v19.16b,v19.16b,v18.16b
+       eor     w15,w15,w9,ror#20
+       add     w8,w8,w11
+       sli     v17.4s,v0.4s,#13
+       ldr     w12,[sp,#12]
+       and     w14,w14,w13
+       ror     w15,w15,#2
+       ld1     {v4.4s},[x16], #16
+       add     w4,w4,w8
+       eor     v19.16b,v19.16b,v17.16b
+       eor     w14,w14,w10
+       eor     v17.16b,v17.16b,v17.16b
+       add     w7,w7,w12
+       add     w8,w8,w15
+       and     w12,w5,w4
+       mov     v17.d[1],v19.d[0]
+       bic     w15,w6,w4
+       eor     w11,w4,w4,ror#5
+       add     w8,w8,w14
+       add     v0.4s,v0.4s,v17.4s
+       orr     w12,w12,w15
+       eor     w11,w11,w4,ror#19
+       eor     w15,w8,w8,ror#11
+       add     v4.4s,v4.4s,v0.4s
+       add     w7,w7,w12
+       ror     w11,w11,#6
+       eor     w14,w8,w9
+       eor     w15,w15,w8,ror#20
+       add     w7,w7,w11
+       ldr     w12,[sp,#16]
+       and     w13,w13,w14
+       ror     w15,w15,#2
+       add     w3,w3,w7
+       eor     w13,w13,w9
+       st1     {v4.4s},[x17], #16
+       ext     v4.16b,v1.16b,v2.16b,#4
+       add     w6,w6,w12
+       add     w7,w7,w15
+       and     w12,w4,w3
+       bic     w15,w5,w3
+       ext     v7.16b,v3.16b,v0.16b,#4
+       eor     w11,w3,w3,ror#5
+       add     w7,w7,w13
+       mov     d19,v0.d[1]
+       orr     w12,w12,w15
+       eor     w11,w11,w3,ror#19
+       ushr    v6.4s,v4.4s,#7
+       eor     w15,w7,w7,ror#11
+       ushr    v5.4s,v4.4s,#3
+       add     w6,w6,w12
+       add     v1.4s,v1.4s,v7.4s
+       ror     w11,w11,#6
+       sli     v6.4s,v4.4s,#25
+       eor     w13,w7,w8
+       eor     w15,w15,w7,ror#20
+       ushr    v7.4s,v4.4s,#18
+       add     w6,w6,w11
+       ldr     w12,[sp,#20]
+       and     w14,w14,w13
+       eor     v5.16b,v5.16b,v6.16b
+       ror     w15,w15,#2
+       add     w10,w10,w6
+       sli     v7.4s,v4.4s,#14
+       eor     w14,w14,w8
+       ushr    v16.4s,v19.4s,#17
+       add     w5,w5,w12
+       add     w6,w6,w15
+       and     w12,w3,w10
+       eor     v5.16b,v5.16b,v7.16b
+       bic     w15,w4,w10
+       eor     w11,w10,w10,ror#5
+       sli     v16.4s,v19.4s,#15
+       add     w6,w6,w14
+       orr     w12,w12,w15
+       ushr    v17.4s,v19.4s,#10
+       eor     w11,w11,w10,ror#19
+       eor     w15,w6,w6,ror#11
+       ushr    v7.4s,v19.4s,#19
+       add     w5,w5,w12
+       ror     w11,w11,#6
+       add     v1.4s,v1.4s,v5.4s
+       eor     w14,w6,w7
+       eor     w15,w15,w6,ror#20
+       sli     v7.4s,v19.4s,#13
+       add     w5,w5,w11
+       ldr     w12,[sp,#24]
+       and     w13,w13,w14
+       eor     v17.16b,v17.16b,v16.16b
+       ror     w15,w15,#2
+       add     w9,w9,w5
+       eor     w13,w13,w7
+       eor     v17.16b,v17.16b,v7.16b
+       add     w4,w4,w12
+       add     w5,w5,w15
+       and     w12,w10,w9
+       add     v1.4s,v1.4s,v17.4s
+       bic     w15,w3,w9
+       eor     w11,w9,w9,ror#5
+       add     w5,w5,w13
+       ushr    v18.4s,v1.4s,#17
+       orr     w12,w12,w15
+       ushr    v19.4s,v1.4s,#10
+       eor     w11,w11,w9,ror#19
+       eor     w15,w5,w5,ror#11
+       sli     v18.4s,v1.4s,#15
+       add     w4,w4,w12
+       ushr    v17.4s,v1.4s,#19
+       ror     w11,w11,#6
+       eor     w13,w5,w6
+       eor     v19.16b,v19.16b,v18.16b
+       eor     w15,w15,w5,ror#20
+       add     w4,w4,w11
+       sli     v17.4s,v1.4s,#13
+       ldr     w12,[sp,#28]
+       and     w14,w14,w13
+       ror     w15,w15,#2
+       ld1     {v4.4s},[x16], #16
+       add     w8,w8,w4
+       eor     v19.16b,v19.16b,v17.16b
+       eor     w14,w14,w6
+       eor     v17.16b,v17.16b,v17.16b
+       add     w3,w3,w12
+       add     w4,w4,w15
+       and     w12,w9,w8
+       mov     v17.d[1],v19.d[0]
+       bic     w15,w10,w8
+       eor     w11,w8,w8,ror#5
+       add     w4,w4,w14
+       add     v1.4s,v1.4s,v17.4s
+       orr     w12,w12,w15
+       eor     w11,w11,w8,ror#19
+       eor     w15,w4,w4,ror#11
+       add     v4.4s,v4.4s,v1.4s
+       add     w3,w3,w12
+       ror     w11,w11,#6
+       eor     w14,w4,w5
+       eor     w15,w15,w4,ror#20
+       add     w3,w3,w11
+       ldr     w12,[sp,#32]
+       and     w13,w13,w14
+       ror     w15,w15,#2
+       add     w7,w7,w3
+       eor     w13,w13,w5
+       st1     {v4.4s},[x17], #16
+       ext     v4.16b,v2.16b,v3.16b,#4
+       add     w10,w10,w12
+       add     w3,w3,w15
+       and     w12,w8,w7
+       bic     w15,w9,w7
+       ext     v7.16b,v0.16b,v1.16b,#4
+       eor     w11,w7,w7,ror#5
+       add     w3,w3,w13
+       mov     d19,v1.d[1]
+       orr     w12,w12,w15
+       eor     w11,w11,w7,ror#19
+       ushr    v6.4s,v4.4s,#7
+       eor     w15,w3,w3,ror#11
+       ushr    v5.4s,v4.4s,#3
+       add     w10,w10,w12
+       add     v2.4s,v2.4s,v7.4s
+       ror     w11,w11,#6
+       sli     v6.4s,v4.4s,#25
+       eor     w13,w3,w4
+       eor     w15,w15,w3,ror#20
+       ushr    v7.4s,v4.4s,#18
+       add     w10,w10,w11
+       ldr     w12,[sp,#36]
+       and     w14,w14,w13
+       eor     v5.16b,v5.16b,v6.16b
+       ror     w15,w15,#2
+       add     w6,w6,w10
+       sli     v7.4s,v4.4s,#14
+       eor     w14,w14,w4
+       ushr    v16.4s,v19.4s,#17
+       add     w9,w9,w12
+       add     w10,w10,w15
+       and     w12,w7,w6
+       eor     v5.16b,v5.16b,v7.16b
+       bic     w15,w8,w6
+       eor     w11,w6,w6,ror#5
+       sli     v16.4s,v19.4s,#15
+       add     w10,w10,w14
+       orr     w12,w12,w15
+       ushr    v17.4s,v19.4s,#10
+       eor     w11,w11,w6,ror#19
+       eor     w15,w10,w10,ror#11
+       ushr    v7.4s,v19.4s,#19
+       add     w9,w9,w12
+       ror     w11,w11,#6
+       add     v2.4s,v2.4s,v5.4s
+       eor     w14,w10,w3
+       eor     w15,w15,w10,ror#20
+       sli     v7.4s,v19.4s,#13
+       add     w9,w9,w11
+       ldr     w12,[sp,#40]
+       and     w13,w13,w14
+       eor     v17.16b,v17.16b,v16.16b
+       ror     w15,w15,#2
+       add     w5,w5,w9
+       eor     w13,w13,w3
+       eor     v17.16b,v17.16b,v7.16b
+       add     w8,w8,w12
+       add     w9,w9,w15
+       and     w12,w6,w5
+       add     v2.4s,v2.4s,v17.4s
+       bic     w15,w7,w5
+       eor     w11,w5,w5,ror#5
+       add     w9,w9,w13
+       ushr    v18.4s,v2.4s,#17
+       orr     w12,w12,w15
+       ushr    v19.4s,v2.4s,#10
+       eor     w11,w11,w5,ror#19
+       eor     w15,w9,w9,ror#11
+       sli     v18.4s,v2.4s,#15
+       add     w8,w8,w12
+       ushr    v17.4s,v2.4s,#19
+       ror     w11,w11,#6
+       eor     w13,w9,w10
+       eor     v19.16b,v19.16b,v18.16b
+       eor     w15,w15,w9,ror#20
+       add     w8,w8,w11
+       sli     v17.4s,v2.4s,#13
+       ldr     w12,[sp,#44]
+       and     w14,w14,w13
+       ror     w15,w15,#2
+       ld1     {v4.4s},[x16], #16
+       add     w4,w4,w8
+       eor     v19.16b,v19.16b,v17.16b
+       eor     w14,w14,w10
+       eor     v17.16b,v17.16b,v17.16b
+       add     w7,w7,w12
+       add     w8,w8,w15
+       and     w12,w5,w4
+       mov     v17.d[1],v19.d[0]
+       bic     w15,w6,w4
+       eor     w11,w4,w4,ror#5
+       add     w8,w8,w14
+       add     v2.4s,v2.4s,v17.4s
+       orr     w12,w12,w15
+       eor     w11,w11,w4,ror#19
+       eor     w15,w8,w8,ror#11
+       add     v4.4s,v4.4s,v2.4s
+       add     w7,w7,w12
+       ror     w11,w11,#6
+       eor     w14,w8,w9
+       eor     w15,w15,w8,ror#20
+       add     w7,w7,w11
+       ldr     w12,[sp,#48]
+       and     w13,w13,w14
+       ror     w15,w15,#2
+       add     w3,w3,w7
+       eor     w13,w13,w9
+       st1     {v4.4s},[x17], #16
+       ext     v4.16b,v3.16b,v0.16b,#4
+       add     w6,w6,w12
+       add     w7,w7,w15
+       and     w12,w4,w3
+       bic     w15,w5,w3
+       ext     v7.16b,v1.16b,v2.16b,#4
+       eor     w11,w3,w3,ror#5
+       add     w7,w7,w13
+       mov     d19,v2.d[1]
+       orr     w12,w12,w15
+       eor     w11,w11,w3,ror#19
+       ushr    v6.4s,v4.4s,#7
+       eor     w15,w7,w7,ror#11
+       ushr    v5.4s,v4.4s,#3
+       add     w6,w6,w12
+       add     v3.4s,v3.4s,v7.4s
+       ror     w11,w11,#6
+       sli     v6.4s,v4.4s,#25
+       eor     w13,w7,w8
+       eor     w15,w15,w7,ror#20
+       ushr    v7.4s,v4.4s,#18
+       add     w6,w6,w11
+       ldr     w12,[sp,#52]
+       and     w14,w14,w13
+       eor     v5.16b,v5.16b,v6.16b
+       ror     w15,w15,#2
+       add     w10,w10,w6
+       sli     v7.4s,v4.4s,#14
+       eor     w14,w14,w8
+       ushr    v16.4s,v19.4s,#17
+       add     w5,w5,w12
+       add     w6,w6,w15
+       and     w12,w3,w10
+       eor     v5.16b,v5.16b,v7.16b
+       bic     w15,w4,w10
+       eor     w11,w10,w10,ror#5
+       sli     v16.4s,v19.4s,#15
+       add     w6,w6,w14
+       orr     w12,w12,w15
+       ushr    v17.4s,v19.4s,#10
+       eor     w11,w11,w10,ror#19
+       eor     w15,w6,w6,ror#11
+       ushr    v7.4s,v19.4s,#19
+       add     w5,w5,w12
+       ror     w11,w11,#6
+       add     v3.4s,v3.4s,v5.4s
+       eor     w14,w6,w7
+       eor     w15,w15,w6,ror#20
+       sli     v7.4s,v19.4s,#13
+       add     w5,w5,w11
+       ldr     w12,[sp,#56]
+       and     w13,w13,w14
+       eor     v17.16b,v17.16b,v16.16b
+       ror     w15,w15,#2
+       add     w9,w9,w5
+       eor     w13,w13,w7
+       eor     v17.16b,v17.16b,v7.16b
+       add     w4,w4,w12
+       add     w5,w5,w15
+       and     w12,w10,w9
+       add     v3.4s,v3.4s,v17.4s
+       bic     w15,w3,w9
+       eor     w11,w9,w9,ror#5
+       add     w5,w5,w13
+       ushr    v18.4s,v3.4s,#17
+       orr     w12,w12,w15
+       ushr    v19.4s,v3.4s,#10
+       eor     w11,w11,w9,ror#19
+       eor     w15,w5,w5,ror#11
+       sli     v18.4s,v3.4s,#15
+       add     w4,w4,w12
+       ushr    v17.4s,v3.4s,#19
+       ror     w11,w11,#6
+       eor     w13,w5,w6
+       eor     v19.16b,v19.16b,v18.16b
+       eor     w15,w15,w5,ror#20
+       add     w4,w4,w11
+       sli     v17.4s,v3.4s,#13
+       ldr     w12,[sp,#60]
+       and     w14,w14,w13
+       ror     w15,w15,#2
+       ld1     {v4.4s},[x16], #16
+       add     w8,w8,w4
+       eor     v19.16b,v19.16b,v17.16b
+       eor     w14,w14,w6
+       eor     v17.16b,v17.16b,v17.16b
+       add     w3,w3,w12
+       add     w4,w4,w15
+       and     w12,w9,w8
+       mov     v17.d[1],v19.d[0]
+       bic     w15,w10,w8
+       eor     w11,w8,w8,ror#5
+       add     w4,w4,w14
+       add     v3.4s,v3.4s,v17.4s
+       orr     w12,w12,w15
+       eor     w11,w11,w8,ror#19
+       eor     w15,w4,w4,ror#11
+       add     v4.4s,v4.4s,v3.4s
+       add     w3,w3,w12
+       ror     w11,w11,#6
+       eor     w14,w4,w5
+       eor     w15,w15,w4,ror#20
+       add     w3,w3,w11
+       ldr     w12,[x16]
+       and     w13,w13,w14
+       ror     w15,w15,#2
+       add     w7,w7,w3
+       eor     w13,w13,w5
+       st1     {v4.4s},[x17], #16
+       cmp     w12,#0                          // check for K256 terminator
+       ldr     w12,[sp,#0]
+       sub     x17,x17,#64
+       bne     .L_00_48
+
+       sub     x16,x16,#256            // rewind x16
+       cmp     x1,x2
+       mov     x17, #64
+       csel    x17, x17, xzr, eq
+       sub     x1,x1,x17                       // avoid SEGV
+       mov     x17,sp
+       add     w10,w10,w12
+       add     w3,w3,w15
+       and     w12,w8,w7
+       ld1     {v0.16b},[x1],#16
+       bic     w15,w9,w7
+       eor     w11,w7,w7,ror#5
+       ld1     {v4.4s},[x16],#16
+       add     w3,w3,w13
+       orr     w12,w12,w15
+       eor     w11,w11,w7,ror#19
+       eor     w15,w3,w3,ror#11
+       rev32   v0.16b,v0.16b
+       add     w10,w10,w12
+       ror     w11,w11,#6
+       eor     w13,w3,w4
+       eor     w15,w15,w3,ror#20
+       add     v4.4s,v4.4s,v0.4s
+       add     w10,w10,w11
+       ldr     w12,[sp,#4]
+       and     w14,w14,w13
+       ror     w15,w15,#2
+       add     w6,w6,w10
+       eor     w14,w14,w4
+       add     w9,w9,w12
+       add     w10,w10,w15
+       and     w12,w7,w6
+       bic     w15,w8,w6
+       eor     w11,w6,w6,ror#5
+       add     w10,w10,w14
+       orr     w12,w12,w15
+       eor     w11,w11,w6,ror#19
+       eor     w15,w10,w10,ror#11
+       add     w9,w9,w12
+       ror     w11,w11,#6
+       eor     w14,w10,w3
+       eor     w15,w15,w10,ror#20
+       add     w9,w9,w11
+       ldr     w12,[sp,#8]
+       and     w13,w13,w14
+       ror     w15,w15,#2
+       add     w5,w5,w9
+       eor     w13,w13,w3
+       add     w8,w8,w12
+       add     w9,w9,w15
+       and     w12,w6,w5
+       bic     w15,w7,w5
+       eor     w11,w5,w5,ror#5
+       add     w9,w9,w13
+       orr     w12,w12,w15
+       eor     w11,w11,w5,ror#19
+       eor     w15,w9,w9,ror#11
+       add     w8,w8,w12
+       ror     w11,w11,#6
+       eor     w13,w9,w10
+       eor     w15,w15,w9,ror#20
+       add     w8,w8,w11
+       ldr     w12,[sp,#12]
+       and     w14,w14,w13
+       ror     w15,w15,#2
+       add     w4,w4,w8
+       eor     w14,w14,w10
+       add     w7,w7,w12
+       add     w8,w8,w15
+       and     w12,w5,w4
+       bic     w15,w6,w4
+       eor     w11,w4,w4,ror#5
+       add     w8,w8,w14
+       orr     w12,w12,w15
+       eor     w11,w11,w4,ror#19
+       eor     w15,w8,w8,ror#11
+       add     w7,w7,w12
+       ror     w11,w11,#6
+       eor     w14,w8,w9
+       eor     w15,w15,w8,ror#20
+       add     w7,w7,w11
+       ldr     w12,[sp,#16]
+       and     w13,w13,w14
+       ror     w15,w15,#2
+       add     w3,w3,w7
+       eor     w13,w13,w9
+       st1     {v4.4s},[x17], #16
+       add     w6,w6,w12
+       add     w7,w7,w15
+       and     w12,w4,w3
+       ld1     {v1.16b},[x1],#16
+       bic     w15,w5,w3
+       eor     w11,w3,w3,ror#5
+       ld1     {v4.4s},[x16],#16
+       add     w7,w7,w13
+       orr     w12,w12,w15
+       eor     w11,w11,w3,ror#19
+       eor     w15,w7,w7,ror#11
+       rev32   v1.16b,v1.16b
+       add     w6,w6,w12
+       ror     w11,w11,#6
+       eor     w13,w7,w8
+       eor     w15,w15,w7,ror#20
+       add     v4.4s,v4.4s,v1.4s
+       add     w6,w6,w11
+       ldr     w12,[sp,#20]
+       and     w14,w14,w13
+       ror     w15,w15,#2
+       add     w10,w10,w6
+       eor     w14,w14,w8
+       add     w5,w5,w12
+       add     w6,w6,w15
+       and     w12,w3,w10
+       bic     w15,w4,w10
+       eor     w11,w10,w10,ror#5
+       add     w6,w6,w14
+       orr     w12,w12,w15
+       eor     w11,w11,w10,ror#19
+       eor     w15,w6,w6,ror#11
+       add     w5,w5,w12
+       ror     w11,w11,#6
+       eor     w14,w6,w7
+       eor     w15,w15,w6,ror#20
+       add     w5,w5,w11
+       ldr     w12,[sp,#24]
+       and     w13,w13,w14
+       ror     w15,w15,#2
+       add     w9,w9,w5
+       eor     w13,w13,w7
+       add     w4,w4,w12
+       add     w5,w5,w15
+       and     w12,w10,w9
+       bic     w15,w3,w9
+       eor     w11,w9,w9,ror#5
+       add     w5,w5,w13
+       orr     w12,w12,w15
+       eor     w11,w11,w9,ror#19
+       eor     w15,w5,w5,ror#11
+       add     w4,w4,w12
+       ror     w11,w11,#6
+       eor     w13,w5,w6
+       eor     w15,w15,w5,ror#20
+       add     w4,w4,w11
+       ldr     w12,[sp,#28]
+       and     w14,w14,w13
+       ror     w15,w15,#2
+       add     w8,w8,w4
+       eor     w14,w14,w6
+       add     w3,w3,w12
+       add     w4,w4,w15
+       and     w12,w9,w8
+       bic     w15,w10,w8
+       eor     w11,w8,w8,ror#5
+       add     w4,w4,w14
+       orr     w12,w12,w15
+       eor     w11,w11,w8,ror#19
+       eor     w15,w4,w4,ror#11
+       add     w3,w3,w12
+       ror     w11,w11,#6
+       eor     w14,w4,w5
+       eor     w15,w15,w4,ror#20
+       add     w3,w3,w11
+       ldr     w12,[sp,#32]
+       and     w13,w13,w14
+       ror     w15,w15,#2
+       add     w7,w7,w3
+       eor     w13,w13,w5
+       st1     {v4.4s},[x17], #16
+       add     w10,w10,w12
+       add     w3,w3,w15
+       and     w12,w8,w7
+       ld1     {v2.16b},[x1],#16
+       bic     w15,w9,w7
+       eor     w11,w7,w7,ror#5
+       ld1     {v4.4s},[x16],#16
+       add     w3,w3,w13
+       orr     w12,w12,w15
+       eor     w11,w11,w7,ror#19
+       eor     w15,w3,w3,ror#11
+       rev32   v2.16b,v2.16b
+       add     w10,w10,w12
+       ror     w11,w11,#6
+       eor     w13,w3,w4
+       eor     w15,w15,w3,ror#20
+       add     v4.4s,v4.4s,v2.4s
+       add     w10,w10,w11
+       ldr     w12,[sp,#36]
+       and     w14,w14,w13
+       ror     w15,w15,#2
+       add     w6,w6,w10
+       eor     w14,w14,w4
+       add     w9,w9,w12
+       add     w10,w10,w15
+       and     w12,w7,w6
+       bic     w15,w8,w6
+       eor     w11,w6,w6,ror#5
+       add     w10,w10,w14
+       orr     w12,w12,w15
+       eor     w11,w11,w6,ror#19
+       eor     w15,w10,w10,ror#11
+       add     w9,w9,w12
+       ror     w11,w11,#6
+       eor     w14,w10,w3
+       eor     w15,w15,w10,ror#20
+       add     w9,w9,w11
+       ldr     w12,[sp,#40]
+       and     w13,w13,w14
+       ror     w15,w15,#2
+       add     w5,w5,w9
+       eor     w13,w13,w3
+       add     w8,w8,w12
+       add     w9,w9,w15
+       and     w12,w6,w5
+       bic     w15,w7,w5
+       eor     w11,w5,w5,ror#5
+       add     w9,w9,w13
+       orr     w12,w12,w15
+       eor     w11,w11,w5,ror#19
+       eor     w15,w9,w9,ror#11
+       add     w8,w8,w12
+       ror     w11,w11,#6
+       eor     w13,w9,w10
+       eor     w15,w15,w9,ror#20
+       add     w8,w8,w11
+       ldr     w12,[sp,#44]
+       and     w14,w14,w13
+       ror     w15,w15,#2
+       add     w4,w4,w8
+       eor     w14,w14,w10
+       add     w7,w7,w12
+       add     w8,w8,w15
+       and     w12,w5,w4
+       bic     w15,w6,w4
+       eor     w11,w4,w4,ror#5
+       add     w8,w8,w14
+       orr     w12,w12,w15
+       eor     w11,w11,w4,ror#19
+       eor     w15,w8,w8,ror#11
+       add     w7,w7,w12
+       ror     w11,w11,#6
+       eor     w14,w8,w9
+       eor     w15,w15,w8,ror#20
+       add     w7,w7,w11
+       ldr     w12,[sp,#48]
+       and     w13,w13,w14
+       ror     w15,w15,#2
+       add     w3,w3,w7
+       eor     w13,w13,w9
+       st1     {v4.4s},[x17], #16
+       add     w6,w6,w12
+       add     w7,w7,w15
+       and     w12,w4,w3
+       ld1     {v3.16b},[x1],#16
+       bic     w15,w5,w3
+       eor     w11,w3,w3,ror#5
+       ld1     {v4.4s},[x16],#16
+       add     w7,w7,w13
+       orr     w12,w12,w15
+       eor     w11,w11,w3,ror#19
+       eor     w15,w7,w7,ror#11
+       rev32   v3.16b,v3.16b
+       add     w6,w6,w12
+       ror     w11,w11,#6
+       eor     w13,w7,w8
+       eor     w15,w15,w7,ror#20
+       add     v4.4s,v4.4s,v3.4s
+       add     w6,w6,w11
+       ldr     w12,[sp,#52]
+       and     w14,w14,w13
+       ror     w15,w15,#2
+       add     w10,w10,w6
+       eor     w14,w14,w8
+       add     w5,w5,w12
+       add     w6,w6,w15
+       and     w12,w3,w10
+       bic     w15,w4,w10
+       eor     w11,w10,w10,ror#5
+       add     w6,w6,w14
+       orr     w12,w12,w15
+       eor     w11,w11,w10,ror#19
+       eor     w15,w6,w6,ror#11
+       add     w5,w5,w12
+       ror     w11,w11,#6
+       eor     w14,w6,w7
+       eor     w15,w15,w6,ror#20
+       add     w5,w5,w11
+       ldr     w12,[sp,#56]
+       and     w13,w13,w14
+       ror     w15,w15,#2
+       add     w9,w9,w5
+       eor     w13,w13,w7
+       add     w4,w4,w12
+       add     w5,w5,w15
+       and     w12,w10,w9
+       bic     w15,w3,w9
+       eor     w11,w9,w9,ror#5
+       add     w5,w5,w13
+       orr     w12,w12,w15
+       eor     w11,w11,w9,ror#19
+       eor     w15,w5,w5,ror#11
+       add     w4,w4,w12
+       ror     w11,w11,#6
+       eor     w13,w5,w6
+       eor     w15,w15,w5,ror#20
+       add     w4,w4,w11
+       ldr     w12,[sp,#60]
+       and     w14,w14,w13
+       ror     w15,w15,#2
+       add     w8,w8,w4
+       eor     w14,w14,w6
+       add     w3,w3,w12
+       add     w4,w4,w15
+       and     w12,w9,w8
+       bic     w15,w10,w8
+       eor     w11,w8,w8,ror#5
+       add     w4,w4,w14
+       orr     w12,w12,w15
+       eor     w11,w11,w8,ror#19
+       eor     w15,w4,w4,ror#11
+       add     w3,w3,w12
+       ror     w11,w11,#6
+       eor     w14,w4,w5
+       eor     w15,w15,w4,ror#20
+       add     w3,w3,w11
+       and     w13,w13,w14
+       ror     w15,w15,#2
+       add     w7,w7,w3
+       eor     w13,w13,w5
+       st1     {v4.4s},[x17], #16
+       add     w3,w3,w15                       // h+=Sigma0(a) from the past
+       ldp     w11,w12,[x0,#0]
+       add     w3,w3,w13                       // h+=Maj(a,b,c) from the past
+       ldp     w13,w14,[x0,#8]
+       add     w3,w3,w11                       // accumulate
+       add     w4,w4,w12
+       ldp     w11,w12,[x0,#16]
+       add     w5,w5,w13
+       add     w6,w6,w14
+       ldp     w13,w14,[x0,#24]
+       add     w7,w7,w11
+       add     w8,w8,w12
+        ldr    w12,[sp,#0]
+       stp     w3,w4,[x0,#0]
+       add     w9,w9,w13
+        mov    w13,wzr
+       stp     w5,w6,[x0,#8]
+       add     w10,w10,w14
+       stp     w7,w8,[x0,#16]
+        eor    w14,w4,w5
+       stp     w9,w10,[x0,#24]
+        mov    w15,wzr
+        mov    x17,sp
+       b.ne    .L_00_48
+
+       ldr     x29,[x29]
+       add     sp,sp,#16*4+16
+       ret
+.size  zfs_sha256_block_neon,.-zfs_sha256_block_neon
+
+#endif
diff --git a/module/icp/asm-aarch64/sha2/sha512-armv8.S b/module/icp/asm-aarch64/sha2/sha512-armv8.S
new file mode 100644 (file)
index 0000000..1683fc1
--- /dev/null
@@ -0,0 +1,1558 @@
+/*
+ * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Portions Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ * - modified assembly to fit into OpenZFS
+ */
+
+#if defined(__aarch64__)
+
+.text
+
+.align 6
+.type  .LK512,%object
+.LK512:
+       .quad   0x428a2f98d728ae22,0x7137449123ef65cd
+       .quad   0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+       .quad   0x3956c25bf348b538,0x59f111f1b605d019
+       .quad   0x923f82a4af194f9b,0xab1c5ed5da6d8118
+       .quad   0xd807aa98a3030242,0x12835b0145706fbe
+       .quad   0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+       .quad   0x72be5d74f27b896f,0x80deb1fe3b1696b1
+       .quad   0x9bdc06a725c71235,0xc19bf174cf692694
+       .quad   0xe49b69c19ef14ad2,0xefbe4786384f25e3
+       .quad   0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+       .quad   0x2de92c6f592b0275,0x4a7484aa6ea6e483
+       .quad   0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+       .quad   0x983e5152ee66dfab,0xa831c66d2db43210
+       .quad   0xb00327c898fb213f,0xbf597fc7beef0ee4
+       .quad   0xc6e00bf33da88fc2,0xd5a79147930aa725
+       .quad   0x06ca6351e003826f,0x142929670a0e6e70
+       .quad   0x27b70a8546d22ffc,0x2e1b21385c26c926
+       .quad   0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+       .quad   0x650a73548baf63de,0x766a0abb3c77b2a8
+       .quad   0x81c2c92e47edaee6,0x92722c851482353b
+       .quad   0xa2bfe8a14cf10364,0xa81a664bbc423001
+       .quad   0xc24b8b70d0f89791,0xc76c51a30654be30
+       .quad   0xd192e819d6ef5218,0xd69906245565a910
+       .quad   0xf40e35855771202a,0x106aa07032bbd1b8
+       .quad   0x19a4c116b8d2d0c8,0x1e376c085141ab53
+       .quad   0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+       .quad   0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+       .quad   0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+       .quad   0x748f82ee5defb2fc,0x78a5636f43172f60
+       .quad   0x84c87814a1f0ab72,0x8cc702081a6439ec
+       .quad   0x90befffa23631e28,0xa4506cebde82bde9
+       .quad   0xbef9a3f7b2c67915,0xc67178f2e372532b
+       .quad   0xca273eceea26619c,0xd186b8c721c0c207
+       .quad   0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+       .quad   0x06f067aa72176fba,0x0a637dc5a2c898a6
+       .quad   0x113f9804bef90dae,0x1b710b35131c471b
+       .quad   0x28db77f523047d84,0x32caab7b40c72493
+       .quad   0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+       .quad   0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+       .quad   0x5fcb6fab3ad6faec,0x6c44198c4a475817
+       .quad   0       // terminator
+.size  .LK512,.-.LK512
+
+.globl zfs_sha512_block_armv7
+.type  zfs_sha512_block_armv7,%function
+.align 6
+zfs_sha512_block_armv7:
+       stp     x29,x30,[sp,#-128]!
+       add     x29,sp,#0
+
+       stp     x19,x20,[sp,#16]
+       stp     x21,x22,[sp,#32]
+       stp     x23,x24,[sp,#48]
+       stp     x25,x26,[sp,#64]
+       stp     x27,x28,[sp,#80]
+       sub     sp,sp,#4*8
+
+       ldp     x20,x21,[x0]                            // load context
+       ldp     x22,x23,[x0,#2*8]
+       ldp     x24,x25,[x0,#4*8]
+       add     x2,x1,x2,lsl#7  // end of input
+       ldp     x26,x27,[x0,#6*8]
+       adr     x30,.LK512
+       stp     x0,x2,[x29,#96]
+
+.Loop:
+       ldp     x3,x4,[x1],#2*8
+       ldr     x19,[x30],#8                    // *K++
+       eor     x28,x21,x22                             // magic seed
+       str     x1,[x29,#112]
+#ifndef        __AARCH64EB__
+       rev     x3,x3                   // 0
+#endif
+       ror     x16,x24,#14
+       add     x27,x27,x19                     // h+=K[i]
+       eor     x6,x24,x24,ror#23
+       and     x17,x25,x24
+       bic     x19,x26,x24
+       add     x27,x27,x3                      // h+=X[i]
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x20,x21                     // a^b, b^c in next round
+       eor     x16,x16,x6,ror#18       // Sigma1(e)
+       ror     x6,x20,#28
+       add     x27,x27,x17                     // h+=Ch(e,f,g)
+       eor     x17,x20,x20,ror#5
+       add     x27,x27,x16                     // h+=Sigma1(e)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       add     x23,x23,x27                     // d+=h
+       eor     x28,x28,x21                     // Maj(a,b,c)
+       eor     x17,x6,x17,ror#34       // Sigma0(a)
+       add     x27,x27,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       //add   x27,x27,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x4,x4                   // 1
+#endif
+       ldp     x5,x6,[x1],#2*8
+       add     x27,x27,x17                     // h+=Sigma0(a)
+       ror     x16,x23,#14
+       add     x26,x26,x28                     // h+=K[i]
+       eor     x7,x23,x23,ror#23
+       and     x17,x24,x23
+       bic     x28,x25,x23
+       add     x26,x26,x4                      // h+=X[i]
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x27,x20                     // a^b, b^c in next round
+       eor     x16,x16,x7,ror#18       // Sigma1(e)
+       ror     x7,x27,#28
+       add     x26,x26,x17                     // h+=Ch(e,f,g)
+       eor     x17,x27,x27,ror#5
+       add     x26,x26,x16                     // h+=Sigma1(e)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       add     x22,x22,x26                     // d+=h
+       eor     x19,x19,x20                     // Maj(a,b,c)
+       eor     x17,x7,x17,ror#34       // Sigma0(a)
+       add     x26,x26,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       //add   x26,x26,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x5,x5                   // 2
+#endif
+       add     x26,x26,x17                     // h+=Sigma0(a)
+       ror     x16,x22,#14
+       add     x25,x25,x19                     // h+=K[i]
+       eor     x8,x22,x22,ror#23
+       and     x17,x23,x22
+       bic     x19,x24,x22
+       add     x25,x25,x5                      // h+=X[i]
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x26,x27                     // a^b, b^c in next round
+       eor     x16,x16,x8,ror#18       // Sigma1(e)
+       ror     x8,x26,#28
+       add     x25,x25,x17                     // h+=Ch(e,f,g)
+       eor     x17,x26,x26,ror#5
+       add     x25,x25,x16                     // h+=Sigma1(e)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       add     x21,x21,x25                     // d+=h
+       eor     x28,x28,x27                     // Maj(a,b,c)
+       eor     x17,x8,x17,ror#34       // Sigma0(a)
+       add     x25,x25,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       //add   x25,x25,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x6,x6                   // 3
+#endif
+       ldp     x7,x8,[x1],#2*8
+       add     x25,x25,x17                     // h+=Sigma0(a)
+       ror     x16,x21,#14
+       add     x24,x24,x28                     // h+=K[i]
+       eor     x9,x21,x21,ror#23
+       and     x17,x22,x21
+       bic     x28,x23,x21
+       add     x24,x24,x6                      // h+=X[i]
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x25,x26                     // a^b, b^c in next round
+       eor     x16,x16,x9,ror#18       // Sigma1(e)
+       ror     x9,x25,#28
+       add     x24,x24,x17                     // h+=Ch(e,f,g)
+       eor     x17,x25,x25,ror#5
+       add     x24,x24,x16                     // h+=Sigma1(e)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       add     x20,x20,x24                     // d+=h
+       eor     x19,x19,x26                     // Maj(a,b,c)
+       eor     x17,x9,x17,ror#34       // Sigma0(a)
+       add     x24,x24,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       //add   x24,x24,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x7,x7                   // 4
+#endif
+       add     x24,x24,x17                     // h+=Sigma0(a)
+       ror     x16,x20,#14
+       add     x23,x23,x19                     // h+=K[i]
+       eor     x10,x20,x20,ror#23
+       and     x17,x21,x20
+       bic     x19,x22,x20
+       add     x23,x23,x7                      // h+=X[i]
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x24,x25                     // a^b, b^c in next round
+       eor     x16,x16,x10,ror#18      // Sigma1(e)
+       ror     x10,x24,#28
+       add     x23,x23,x17                     // h+=Ch(e,f,g)
+       eor     x17,x24,x24,ror#5
+       add     x23,x23,x16                     // h+=Sigma1(e)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       add     x27,x27,x23                     // d+=h
+       eor     x28,x28,x25                     // Maj(a,b,c)
+       eor     x17,x10,x17,ror#34      // Sigma0(a)
+       add     x23,x23,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       //add   x23,x23,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x8,x8                   // 5
+#endif
+       ldp     x9,x10,[x1],#2*8
+       add     x23,x23,x17                     // h+=Sigma0(a)
+       ror     x16,x27,#14
+       add     x22,x22,x28                     // h+=K[i]
+       eor     x11,x27,x27,ror#23
+       and     x17,x20,x27
+       bic     x28,x21,x27
+       add     x22,x22,x8                      // h+=X[i]
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x23,x24                     // a^b, b^c in next round
+       eor     x16,x16,x11,ror#18      // Sigma1(e)
+       ror     x11,x23,#28
+       add     x22,x22,x17                     // h+=Ch(e,f,g)
+       eor     x17,x23,x23,ror#5
+       add     x22,x22,x16                     // h+=Sigma1(e)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       add     x26,x26,x22                     // d+=h
+       eor     x19,x19,x24                     // Maj(a,b,c)
+       eor     x17,x11,x17,ror#34      // Sigma0(a)
+       add     x22,x22,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       //add   x22,x22,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x9,x9                   // 6
+#endif
+       add     x22,x22,x17                     // h+=Sigma0(a)
+       ror     x16,x26,#14
+       add     x21,x21,x19                     // h+=K[i]
+       eor     x12,x26,x26,ror#23
+       and     x17,x27,x26
+       bic     x19,x20,x26
+       add     x21,x21,x9                      // h+=X[i]
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x22,x23                     // a^b, b^c in next round
+       eor     x16,x16,x12,ror#18      // Sigma1(e)
+       ror     x12,x22,#28
+       add     x21,x21,x17                     // h+=Ch(e,f,g)
+       eor     x17,x22,x22,ror#5
+       add     x21,x21,x16                     // h+=Sigma1(e)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       add     x25,x25,x21                     // d+=h
+       eor     x28,x28,x23                     // Maj(a,b,c)
+       eor     x17,x12,x17,ror#34      // Sigma0(a)
+       add     x21,x21,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       //add   x21,x21,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x10,x10                 // 7
+#endif
+       ldp     x11,x12,[x1],#2*8
+       add     x21,x21,x17                     // h+=Sigma0(a)
+       ror     x16,x25,#14
+       add     x20,x20,x28                     // h+=K[i]
+       eor     x13,x25,x25,ror#23
+       and     x17,x26,x25
+       bic     x28,x27,x25
+       add     x20,x20,x10                     // h+=X[i]
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x21,x22                     // a^b, b^c in next round
+       eor     x16,x16,x13,ror#18      // Sigma1(e)
+       ror     x13,x21,#28
+       add     x20,x20,x17                     // h+=Ch(e,f,g)
+       eor     x17,x21,x21,ror#5
+       add     x20,x20,x16                     // h+=Sigma1(e)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       add     x24,x24,x20                     // d+=h
+       eor     x19,x19,x22                     // Maj(a,b,c)
+       eor     x17,x13,x17,ror#34      // Sigma0(a)
+       add     x20,x20,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       //add   x20,x20,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x11,x11                 // 8
+#endif
+       add     x20,x20,x17                     // h+=Sigma0(a)
+       ror     x16,x24,#14
+       add     x27,x27,x19                     // h+=K[i]
+       eor     x14,x24,x24,ror#23
+       and     x17,x25,x24
+       bic     x19,x26,x24
+       add     x27,x27,x11                     // h+=X[i]
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x20,x21                     // a^b, b^c in next round
+       eor     x16,x16,x14,ror#18      // Sigma1(e)
+       ror     x14,x20,#28
+       add     x27,x27,x17                     // h+=Ch(e,f,g)
+       eor     x17,x20,x20,ror#5
+       add     x27,x27,x16                     // h+=Sigma1(e)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       add     x23,x23,x27                     // d+=h
+       eor     x28,x28,x21                     // Maj(a,b,c)
+       eor     x17,x14,x17,ror#34      // Sigma0(a)
+       add     x27,x27,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       //add   x27,x27,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x12,x12                 // 9
+#endif
+       ldp     x13,x14,[x1],#2*8
+       add     x27,x27,x17                     // h+=Sigma0(a)
+       ror     x16,x23,#14
+       add     x26,x26,x28                     // h+=K[i]
+       eor     x15,x23,x23,ror#23
+       and     x17,x24,x23
+       bic     x28,x25,x23
+       add     x26,x26,x12                     // h+=X[i]
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x27,x20                     // a^b, b^c in next round
+       eor     x16,x16,x15,ror#18      // Sigma1(e)
+       ror     x15,x27,#28
+       add     x26,x26,x17                     // h+=Ch(e,f,g)
+       eor     x17,x27,x27,ror#5
+       add     x26,x26,x16                     // h+=Sigma1(e)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       add     x22,x22,x26                     // d+=h
+       eor     x19,x19,x20                     // Maj(a,b,c)
+       eor     x17,x15,x17,ror#34      // Sigma0(a)
+       add     x26,x26,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       //add   x26,x26,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x13,x13                 // 10
+#endif
+       add     x26,x26,x17                     // h+=Sigma0(a)
+       ror     x16,x22,#14
+       add     x25,x25,x19                     // h+=K[i]
+       eor     x0,x22,x22,ror#23
+       and     x17,x23,x22
+       bic     x19,x24,x22
+       add     x25,x25,x13                     // h+=X[i]
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x26,x27                     // a^b, b^c in next round
+       eor     x16,x16,x0,ror#18       // Sigma1(e)
+       ror     x0,x26,#28
+       add     x25,x25,x17                     // h+=Ch(e,f,g)
+       eor     x17,x26,x26,ror#5
+       add     x25,x25,x16                     // h+=Sigma1(e)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       add     x21,x21,x25                     // d+=h
+       eor     x28,x28,x27                     // Maj(a,b,c)
+       eor     x17,x0,x17,ror#34       // Sigma0(a)
+       add     x25,x25,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       //add   x25,x25,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x14,x14                 // 11
+#endif
+       ldp     x15,x0,[x1],#2*8
+       add     x25,x25,x17                     // h+=Sigma0(a)
+       str     x6,[sp,#24]
+       ror     x16,x21,#14
+       add     x24,x24,x28                     // h+=K[i]
+       eor     x6,x21,x21,ror#23
+       and     x17,x22,x21
+       bic     x28,x23,x21
+       add     x24,x24,x14                     // h+=X[i]
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x25,x26                     // a^b, b^c in next round
+       eor     x16,x16,x6,ror#18       // Sigma1(e)
+       ror     x6,x25,#28
+       add     x24,x24,x17                     // h+=Ch(e,f,g)
+       eor     x17,x25,x25,ror#5
+       add     x24,x24,x16                     // h+=Sigma1(e)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       add     x20,x20,x24                     // d+=h
+       eor     x19,x19,x26                     // Maj(a,b,c)
+       eor     x17,x6,x17,ror#34       // Sigma0(a)
+       add     x24,x24,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       //add   x24,x24,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x15,x15                 // 12
+#endif
+       add     x24,x24,x17                     // h+=Sigma0(a)
+       str     x7,[sp,#0]
+       ror     x16,x20,#14
+       add     x23,x23,x19                     // h+=K[i]
+       eor     x7,x20,x20,ror#23
+       and     x17,x21,x20
+       bic     x19,x22,x20
+       add     x23,x23,x15                     // h+=X[i]
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x24,x25                     // a^b, b^c in next round
+       eor     x16,x16,x7,ror#18       // Sigma1(e)
+       ror     x7,x24,#28
+       add     x23,x23,x17                     // h+=Ch(e,f,g)
+       eor     x17,x24,x24,ror#5
+       add     x23,x23,x16                     // h+=Sigma1(e)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       add     x27,x27,x23                     // d+=h
+       eor     x28,x28,x25                     // Maj(a,b,c)
+       eor     x17,x7,x17,ror#34       // Sigma0(a)
+       add     x23,x23,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       //add   x23,x23,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x0,x0                   // 13
+#endif
+       ldp     x1,x2,[x1]
+       add     x23,x23,x17                     // h+=Sigma0(a)
+       str     x8,[sp,#8]
+       ror     x16,x27,#14
+       add     x22,x22,x28                     // h+=K[i]
+       eor     x8,x27,x27,ror#23
+       and     x17,x20,x27
+       bic     x28,x21,x27
+       add     x22,x22,x0                      // h+=X[i]
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x23,x24                     // a^b, b^c in next round
+       eor     x16,x16,x8,ror#18       // Sigma1(e)
+       ror     x8,x23,#28
+       add     x22,x22,x17                     // h+=Ch(e,f,g)
+       eor     x17,x23,x23,ror#5
+       add     x22,x22,x16                     // h+=Sigma1(e)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       add     x26,x26,x22                     // d+=h
+       eor     x19,x19,x24                     // Maj(a,b,c)
+       eor     x17,x8,x17,ror#34       // Sigma0(a)
+       add     x22,x22,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       //add   x22,x22,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x1,x1                   // 14
+#endif
+       ldr     x6,[sp,#24]
+       add     x22,x22,x17                     // h+=Sigma0(a)
+       str     x9,[sp,#16]
+       ror     x16,x26,#14
+       add     x21,x21,x19                     // h+=K[i]
+       eor     x9,x26,x26,ror#23
+       and     x17,x27,x26
+       bic     x19,x20,x26
+       add     x21,x21,x1                      // h+=X[i]
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x22,x23                     // a^b, b^c in next round
+       eor     x16,x16,x9,ror#18       // Sigma1(e)
+       ror     x9,x22,#28
+       add     x21,x21,x17                     // h+=Ch(e,f,g)
+       eor     x17,x22,x22,ror#5
+       add     x21,x21,x16                     // h+=Sigma1(e)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       add     x25,x25,x21                     // d+=h
+       eor     x28,x28,x23                     // Maj(a,b,c)
+       eor     x17,x9,x17,ror#34       // Sigma0(a)
+       add     x21,x21,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       //add   x21,x21,x17                     // h+=Sigma0(a)
+#ifndef        __AARCH64EB__
+       rev     x2,x2                   // 15
+#endif
+       ldr     x7,[sp,#0]
+       add     x21,x21,x17                     // h+=Sigma0(a)
+       str     x10,[sp,#24]
+       ror     x16,x25,#14
+       add     x20,x20,x28                     // h+=K[i]
+       ror     x9,x4,#1
+       and     x17,x26,x25
+       ror     x8,x1,#19
+       bic     x28,x27,x25
+       ror     x10,x21,#28
+       add     x20,x20,x2                      // h+=X[i]
+       eor     x16,x16,x25,ror#18
+       eor     x9,x9,x4,ror#8
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x21,x22                     // a^b, b^c in next round
+       eor     x16,x16,x25,ror#41      // Sigma1(e)
+       eor     x10,x10,x21,ror#34
+       add     x20,x20,x17                     // h+=Ch(e,f,g)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       eor     x8,x8,x1,ror#61
+       eor     x9,x9,x4,lsr#7  // sigma0(X[i+1])
+       add     x20,x20,x16                     // h+=Sigma1(e)
+       eor     x19,x19,x22                     // Maj(a,b,c)
+       eor     x17,x10,x21,ror#39      // Sigma0(a)
+       eor     x8,x8,x1,lsr#6  // sigma1(X[i+14])
+       add     x3,x3,x12
+       add     x24,x24,x20                     // d+=h
+       add     x20,x20,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       add     x3,x3,x9
+       add     x20,x20,x17                     // h+=Sigma0(a)
+       add     x3,x3,x8
+.Loop_16_xx:
+       ldr     x8,[sp,#8]
+       str     x11,[sp,#0]
+       ror     x16,x24,#14
+       add     x27,x27,x19                     // h+=K[i]
+       ror     x10,x5,#1
+       and     x17,x25,x24
+       ror     x9,x2,#19
+       bic     x19,x26,x24
+       ror     x11,x20,#28
+       add     x27,x27,x3                      // h+=X[i]
+       eor     x16,x16,x24,ror#18
+       eor     x10,x10,x5,ror#8
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x20,x21                     // a^b, b^c in next round
+       eor     x16,x16,x24,ror#41      // Sigma1(e)
+       eor     x11,x11,x20,ror#34
+       add     x27,x27,x17                     // h+=Ch(e,f,g)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       eor     x9,x9,x2,ror#61
+       eor     x10,x10,x5,lsr#7        // sigma0(X[i+1])
+       add     x27,x27,x16                     // h+=Sigma1(e)
+       eor     x28,x28,x21                     // Maj(a,b,c)
+       eor     x17,x11,x20,ror#39      // Sigma0(a)
+       eor     x9,x9,x2,lsr#6  // sigma1(X[i+14])
+       add     x4,x4,x13
+       add     x23,x23,x27                     // d+=h
+       add     x27,x27,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       add     x4,x4,x10
+       add     x27,x27,x17                     // h+=Sigma0(a)
+       add     x4,x4,x9
+       ldr     x9,[sp,#16]
+       str     x12,[sp,#8]
+       ror     x16,x23,#14
+       add     x26,x26,x28                     // h+=K[i]
+       ror     x11,x6,#1
+       and     x17,x24,x23
+       ror     x10,x3,#19
+       bic     x28,x25,x23
+       ror     x12,x27,#28
+       add     x26,x26,x4                      // h+=X[i]
+       eor     x16,x16,x23,ror#18
+       eor     x11,x11,x6,ror#8
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x27,x20                     // a^b, b^c in next round
+       eor     x16,x16,x23,ror#41      // Sigma1(e)
+       eor     x12,x12,x27,ror#34
+       add     x26,x26,x17                     // h+=Ch(e,f,g)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       eor     x10,x10,x3,ror#61
+       eor     x11,x11,x6,lsr#7        // sigma0(X[i+1])
+       add     x26,x26,x16                     // h+=Sigma1(e)
+       eor     x19,x19,x20                     // Maj(a,b,c)
+       eor     x17,x12,x27,ror#39      // Sigma0(a)
+       eor     x10,x10,x3,lsr#6        // sigma1(X[i+14])
+       add     x5,x5,x14
+       add     x22,x22,x26                     // d+=h
+       add     x26,x26,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       add     x5,x5,x11
+       add     x26,x26,x17                     // h+=Sigma0(a)
+       add     x5,x5,x10
+       ldr     x10,[sp,#24]
+       str     x13,[sp,#16]
+       ror     x16,x22,#14
+       add     x25,x25,x19                     // h+=K[i]
+       ror     x12,x7,#1
+       and     x17,x23,x22
+       ror     x11,x4,#19
+       bic     x19,x24,x22
+       ror     x13,x26,#28
+       add     x25,x25,x5                      // h+=X[i]
+       eor     x16,x16,x22,ror#18
+       eor     x12,x12,x7,ror#8
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x26,x27                     // a^b, b^c in next round
+       eor     x16,x16,x22,ror#41      // Sigma1(e)
+       eor     x13,x13,x26,ror#34
+       add     x25,x25,x17                     // h+=Ch(e,f,g)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       eor     x11,x11,x4,ror#61
+       eor     x12,x12,x7,lsr#7        // sigma0(X[i+1])
+       add     x25,x25,x16                     // h+=Sigma1(e)
+       eor     x28,x28,x27                     // Maj(a,b,c)
+       eor     x17,x13,x26,ror#39      // Sigma0(a)
+       eor     x11,x11,x4,lsr#6        // sigma1(X[i+14])
+       add     x6,x6,x15
+       add     x21,x21,x25                     // d+=h
+       add     x25,x25,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       add     x6,x6,x12
+       add     x25,x25,x17                     // h+=Sigma0(a)
+       add     x6,x6,x11
+       ldr     x11,[sp,#0]
+       str     x14,[sp,#24]
+       ror     x16,x21,#14
+       add     x24,x24,x28                     // h+=K[i]
+       ror     x13,x8,#1
+       and     x17,x22,x21
+       ror     x12,x5,#19
+       bic     x28,x23,x21
+       ror     x14,x25,#28
+       add     x24,x24,x6                      // h+=X[i]
+       eor     x16,x16,x21,ror#18
+       eor     x13,x13,x8,ror#8
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x25,x26                     // a^b, b^c in next round
+       eor     x16,x16,x21,ror#41      // Sigma1(e)
+       eor     x14,x14,x25,ror#34
+       add     x24,x24,x17                     // h+=Ch(e,f,g)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       eor     x12,x12,x5,ror#61
+       eor     x13,x13,x8,lsr#7        // sigma0(X[i+1])
+       add     x24,x24,x16                     // h+=Sigma1(e)
+       eor     x19,x19,x26                     // Maj(a,b,c)
+       eor     x17,x14,x25,ror#39      // Sigma0(a)
+       eor     x12,x12,x5,lsr#6        // sigma1(X[i+14])
+       add     x7,x7,x0
+       add     x20,x20,x24                     // d+=h
+       add     x24,x24,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       add     x7,x7,x13
+       add     x24,x24,x17                     // h+=Sigma0(a)
+       add     x7,x7,x12
+       ldr     x12,[sp,#8]
+       str     x15,[sp,#0]
+       ror     x16,x20,#14
+       add     x23,x23,x19                     // h+=K[i]
+       ror     x14,x9,#1
+       and     x17,x21,x20
+       ror     x13,x6,#19
+       bic     x19,x22,x20
+       ror     x15,x24,#28
+       add     x23,x23,x7                      // h+=X[i]
+       eor     x16,x16,x20,ror#18
+       eor     x14,x14,x9,ror#8
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x24,x25                     // a^b, b^c in next round
+       eor     x16,x16,x20,ror#41      // Sigma1(e)
+       eor     x15,x15,x24,ror#34
+       add     x23,x23,x17                     // h+=Ch(e,f,g)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       eor     x13,x13,x6,ror#61
+       eor     x14,x14,x9,lsr#7        // sigma0(X[i+1])
+       add     x23,x23,x16                     // h+=Sigma1(e)
+       eor     x28,x28,x25                     // Maj(a,b,c)
+       eor     x17,x15,x24,ror#39      // Sigma0(a)
+       eor     x13,x13,x6,lsr#6        // sigma1(X[i+14])
+       add     x8,x8,x1
+       add     x27,x27,x23                     // d+=h
+       add     x23,x23,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       add     x8,x8,x14
+       add     x23,x23,x17                     // h+=Sigma0(a)
+       add     x8,x8,x13
+       ldr     x13,[sp,#16]
+       str     x0,[sp,#8]
+       ror     x16,x27,#14
+       add     x22,x22,x28                     // h+=K[i]
+       ror     x15,x10,#1
+       and     x17,x20,x27
+       ror     x14,x7,#19
+       bic     x28,x21,x27
+       ror     x0,x23,#28
+       add     x22,x22,x8                      // h+=X[i]
+       eor     x16,x16,x27,ror#18
+       eor     x15,x15,x10,ror#8
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x23,x24                     // a^b, b^c in next round
+       eor     x16,x16,x27,ror#41      // Sigma1(e)
+       eor     x0,x0,x23,ror#34
+       add     x22,x22,x17                     // h+=Ch(e,f,g)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       eor     x14,x14,x7,ror#61
+       eor     x15,x15,x10,lsr#7       // sigma0(X[i+1])
+       add     x22,x22,x16                     // h+=Sigma1(e)
+       eor     x19,x19,x24                     // Maj(a,b,c)
+       eor     x17,x0,x23,ror#39       // Sigma0(a)
+       eor     x14,x14,x7,lsr#6        // sigma1(X[i+14])
+       add     x9,x9,x2
+       add     x26,x26,x22                     // d+=h
+       add     x22,x22,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       add     x9,x9,x15
+       add     x22,x22,x17                     // h+=Sigma0(a)
+       add     x9,x9,x14
+       ldr     x14,[sp,#24]
+       str     x1,[sp,#16]
+       ror     x16,x26,#14
+       add     x21,x21,x19                     // h+=K[i]
+       ror     x0,x11,#1
+       and     x17,x27,x26
+       ror     x15,x8,#19
+       bic     x19,x20,x26
+       ror     x1,x22,#28
+       add     x21,x21,x9                      // h+=X[i]
+       eor     x16,x16,x26,ror#18
+       eor     x0,x0,x11,ror#8
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x22,x23                     // a^b, b^c in next round
+       eor     x16,x16,x26,ror#41      // Sigma1(e)
+       eor     x1,x1,x22,ror#34
+       add     x21,x21,x17                     // h+=Ch(e,f,g)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       eor     x15,x15,x8,ror#61
+       eor     x0,x0,x11,lsr#7 // sigma0(X[i+1])
+       add     x21,x21,x16                     // h+=Sigma1(e)
+       eor     x28,x28,x23                     // Maj(a,b,c)
+       eor     x17,x1,x22,ror#39       // Sigma0(a)
+       eor     x15,x15,x8,lsr#6        // sigma1(X[i+14])
+       add     x10,x10,x3
+       add     x25,x25,x21                     // d+=h
+       add     x21,x21,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       add     x10,x10,x0
+       add     x21,x21,x17                     // h+=Sigma0(a)
+       add     x10,x10,x15
+       ldr     x15,[sp,#0]
+       str     x2,[sp,#24]
+       ror     x16,x25,#14
+       add     x20,x20,x28                     // h+=K[i]
+       ror     x1,x12,#1
+       and     x17,x26,x25
+       ror     x0,x9,#19
+       bic     x28,x27,x25
+       ror     x2,x21,#28
+       add     x20,x20,x10                     // h+=X[i]
+       eor     x16,x16,x25,ror#18
+       eor     x1,x1,x12,ror#8
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x21,x22                     // a^b, b^c in next round
+       eor     x16,x16,x25,ror#41      // Sigma1(e)
+       eor     x2,x2,x21,ror#34
+       add     x20,x20,x17                     // h+=Ch(e,f,g)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       eor     x0,x0,x9,ror#61
+       eor     x1,x1,x12,lsr#7 // sigma0(X[i+1])
+       add     x20,x20,x16                     // h+=Sigma1(e)
+       eor     x19,x19,x22                     // Maj(a,b,c)
+       eor     x17,x2,x21,ror#39       // Sigma0(a)
+       eor     x0,x0,x9,lsr#6  // sigma1(X[i+14])
+       add     x11,x11,x4
+       add     x24,x24,x20                     // d+=h
+       add     x20,x20,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       add     x11,x11,x1
+       add     x20,x20,x17                     // h+=Sigma0(a)
+       add     x11,x11,x0
+       ldr     x0,[sp,#8]
+       str     x3,[sp,#0]
+       ror     x16,x24,#14
+       add     x27,x27,x19                     // h+=K[i]
+       ror     x2,x13,#1
+       and     x17,x25,x24
+       ror     x1,x10,#19
+       bic     x19,x26,x24
+       ror     x3,x20,#28
+       add     x27,x27,x11                     // h+=X[i]
+       eor     x16,x16,x24,ror#18
+       eor     x2,x2,x13,ror#8
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x20,x21                     // a^b, b^c in next round
+       eor     x16,x16,x24,ror#41      // Sigma1(e)
+       eor     x3,x3,x20,ror#34
+       add     x27,x27,x17                     // h+=Ch(e,f,g)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       eor     x1,x1,x10,ror#61
+       eor     x2,x2,x13,lsr#7 // sigma0(X[i+1])
+       add     x27,x27,x16                     // h+=Sigma1(e)
+       eor     x28,x28,x21                     // Maj(a,b,c)
+       eor     x17,x3,x20,ror#39       // Sigma0(a)
+       eor     x1,x1,x10,lsr#6 // sigma1(X[i+14])
+       add     x12,x12,x5
+       add     x23,x23,x27                     // d+=h
+       add     x27,x27,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       add     x12,x12,x2
+       add     x27,x27,x17                     // h+=Sigma0(a)
+       add     x12,x12,x1
+       ldr     x1,[sp,#16]
+       str     x4,[sp,#8]
+       ror     x16,x23,#14
+       add     x26,x26,x28                     // h+=K[i]
+       ror     x3,x14,#1
+       and     x17,x24,x23
+       ror     x2,x11,#19
+       bic     x28,x25,x23
+       ror     x4,x27,#28
+       add     x26,x26,x12                     // h+=X[i]
+       eor     x16,x16,x23,ror#18
+       eor     x3,x3,x14,ror#8
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x27,x20                     // a^b, b^c in next round
+       eor     x16,x16,x23,ror#41      // Sigma1(e)
+       eor     x4,x4,x27,ror#34
+       add     x26,x26,x17                     // h+=Ch(e,f,g)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       eor     x2,x2,x11,ror#61
+       eor     x3,x3,x14,lsr#7 // sigma0(X[i+1])
+       add     x26,x26,x16                     // h+=Sigma1(e)
+       eor     x19,x19,x20                     // Maj(a,b,c)
+       eor     x17,x4,x27,ror#39       // Sigma0(a)
+       eor     x2,x2,x11,lsr#6 // sigma1(X[i+14])
+       add     x13,x13,x6
+       add     x22,x22,x26                     // d+=h
+       add     x26,x26,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       add     x13,x13,x3
+       add     x26,x26,x17                     // h+=Sigma0(a)
+       add     x13,x13,x2
+       ldr     x2,[sp,#24]
+       str     x5,[sp,#16]
+       ror     x16,x22,#14
+       add     x25,x25,x19                     // h+=K[i]
+       ror     x4,x15,#1
+       and     x17,x23,x22
+       ror     x3,x12,#19
+       bic     x19,x24,x22
+       ror     x5,x26,#28
+       add     x25,x25,x13                     // h+=X[i]
+       eor     x16,x16,x22,ror#18
+       eor     x4,x4,x15,ror#8
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x26,x27                     // a^b, b^c in next round
+       eor     x16,x16,x22,ror#41      // Sigma1(e)
+       eor     x5,x5,x26,ror#34
+       add     x25,x25,x17                     // h+=Ch(e,f,g)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       eor     x3,x3,x12,ror#61
+       eor     x4,x4,x15,lsr#7 // sigma0(X[i+1])
+       add     x25,x25,x16                     // h+=Sigma1(e)
+       eor     x28,x28,x27                     // Maj(a,b,c)
+       eor     x17,x5,x26,ror#39       // Sigma0(a)
+       eor     x3,x3,x12,lsr#6 // sigma1(X[i+14])
+       add     x14,x14,x7
+       add     x21,x21,x25                     // d+=h
+       add     x25,x25,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       add     x14,x14,x4
+       add     x25,x25,x17                     // h+=Sigma0(a)
+       add     x14,x14,x3
+       ldr     x3,[sp,#0]
+       str     x6,[sp,#24]
+       ror     x16,x21,#14
+       add     x24,x24,x28                     // h+=K[i]
+       ror     x5,x0,#1
+       and     x17,x22,x21
+       ror     x4,x13,#19
+       bic     x28,x23,x21
+       ror     x6,x25,#28
+       add     x24,x24,x14                     // h+=X[i]
+       eor     x16,x16,x21,ror#18
+       eor     x5,x5,x0,ror#8
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x25,x26                     // a^b, b^c in next round
+       eor     x16,x16,x21,ror#41      // Sigma1(e)
+       eor     x6,x6,x25,ror#34
+       add     x24,x24,x17                     // h+=Ch(e,f,g)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       eor     x4,x4,x13,ror#61
+       eor     x5,x5,x0,lsr#7  // sigma0(X[i+1])
+       add     x24,x24,x16                     // h+=Sigma1(e)
+       eor     x19,x19,x26                     // Maj(a,b,c)
+       eor     x17,x6,x25,ror#39       // Sigma0(a)
+       eor     x4,x4,x13,lsr#6 // sigma1(X[i+14])
+       add     x15,x15,x8
+       add     x20,x20,x24                     // d+=h
+       add     x24,x24,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       add     x15,x15,x5
+       add     x24,x24,x17                     // h+=Sigma0(a)
+       add     x15,x15,x4
+       ldr     x4,[sp,#8]
+       str     x7,[sp,#0]
+       ror     x16,x20,#14
+       add     x23,x23,x19                     // h+=K[i]
+       ror     x6,x1,#1
+       and     x17,x21,x20
+       ror     x5,x14,#19
+       bic     x19,x22,x20
+       ror     x7,x24,#28
+       add     x23,x23,x15                     // h+=X[i]
+       eor     x16,x16,x20,ror#18
+       eor     x6,x6,x1,ror#8
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x24,x25                     // a^b, b^c in next round
+       eor     x16,x16,x20,ror#41      // Sigma1(e)
+       eor     x7,x7,x24,ror#34
+       add     x23,x23,x17                     // h+=Ch(e,f,g)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       eor     x5,x5,x14,ror#61
+       eor     x6,x6,x1,lsr#7  // sigma0(X[i+1])
+       add     x23,x23,x16                     // h+=Sigma1(e)
+       eor     x28,x28,x25                     // Maj(a,b,c)
+       eor     x17,x7,x24,ror#39       // Sigma0(a)
+       eor     x5,x5,x14,lsr#6 // sigma1(X[i+14])
+       add     x0,x0,x9
+       add     x27,x27,x23                     // d+=h
+       add     x23,x23,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       add     x0,x0,x6
+       add     x23,x23,x17                     // h+=Sigma0(a)
+       add     x0,x0,x5
+       ldr     x5,[sp,#16]
+       str     x8,[sp,#8]
+       ror     x16,x27,#14
+       add     x22,x22,x28                     // h+=K[i]
+       ror     x7,x2,#1
+       and     x17,x20,x27
+       ror     x6,x15,#19
+       bic     x28,x21,x27
+       ror     x8,x23,#28
+       add     x22,x22,x0                      // h+=X[i]
+       eor     x16,x16,x27,ror#18
+       eor     x7,x7,x2,ror#8
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x23,x24                     // a^b, b^c in next round
+       eor     x16,x16,x27,ror#41      // Sigma1(e)
+       eor     x8,x8,x23,ror#34
+       add     x22,x22,x17                     // h+=Ch(e,f,g)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       eor     x6,x6,x15,ror#61
+       eor     x7,x7,x2,lsr#7  // sigma0(X[i+1])
+       add     x22,x22,x16                     // h+=Sigma1(e)
+       eor     x19,x19,x24                     // Maj(a,b,c)
+       eor     x17,x8,x23,ror#39       // Sigma0(a)
+       eor     x6,x6,x15,lsr#6 // sigma1(X[i+14])
+       add     x1,x1,x10
+       add     x26,x26,x22                     // d+=h
+       add     x22,x22,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       add     x1,x1,x7
+       add     x22,x22,x17                     // h+=Sigma0(a)
+       add     x1,x1,x6
+       ldr     x6,[sp,#24]
+       str     x9,[sp,#16]
+       ror     x16,x26,#14
+       add     x21,x21,x19                     // h+=K[i]
+       ror     x8,x3,#1
+       and     x17,x27,x26
+       ror     x7,x0,#19
+       bic     x19,x20,x26
+       ror     x9,x22,#28
+       add     x21,x21,x1                      // h+=X[i]
+       eor     x16,x16,x26,ror#18
+       eor     x8,x8,x3,ror#8
+       orr     x17,x17,x19                     // Ch(e,f,g)
+       eor     x19,x22,x23                     // a^b, b^c in next round
+       eor     x16,x16,x26,ror#41      // Sigma1(e)
+       eor     x9,x9,x22,ror#34
+       add     x21,x21,x17                     // h+=Ch(e,f,g)
+       and     x28,x28,x19                     // (b^c)&=(a^b)
+       eor     x7,x7,x0,ror#61
+       eor     x8,x8,x3,lsr#7  // sigma0(X[i+1])
+       add     x21,x21,x16                     // h+=Sigma1(e)
+       eor     x28,x28,x23                     // Maj(a,b,c)
+       eor     x17,x9,x22,ror#39       // Sigma0(a)
+       eor     x7,x7,x0,lsr#6  // sigma1(X[i+14])
+       add     x2,x2,x11
+       add     x25,x25,x21                     // d+=h
+       add     x21,x21,x28                     // h+=Maj(a,b,c)
+       ldr     x28,[x30],#8            // *K++, x19 in next round
+       add     x2,x2,x8
+       add     x21,x21,x17                     // h+=Sigma0(a)
+       add     x2,x2,x7
+       ldr     x7,[sp,#0]
+       str     x10,[sp,#24]
+       ror     x16,x25,#14
+       add     x20,x20,x28                     // h+=K[i]
+       ror     x9,x4,#1
+       and     x17,x26,x25
+       ror     x8,x1,#19
+       bic     x28,x27,x25
+       ror     x10,x21,#28
+       add     x20,x20,x2                      // h+=X[i]
+       eor     x16,x16,x25,ror#18
+       eor     x9,x9,x4,ror#8
+       orr     x17,x17,x28                     // Ch(e,f,g)
+       eor     x28,x21,x22                     // a^b, b^c in next round
+       eor     x16,x16,x25,ror#41      // Sigma1(e)
+       eor     x10,x10,x21,ror#34
+       add     x20,x20,x17                     // h+=Ch(e,f,g)
+       and     x19,x19,x28                     // (b^c)&=(a^b)
+       eor     x8,x8,x1,ror#61
+       eor     x9,x9,x4,lsr#7  // sigma0(X[i+1])
+       add     x20,x20,x16                     // h+=Sigma1(e)
+       eor     x19,x19,x22                     // Maj(a,b,c)
+       eor     x17,x10,x21,ror#39      // Sigma0(a)
+       eor     x8,x8,x1,lsr#6  // sigma1(X[i+14])
+       add     x3,x3,x12
+       add     x24,x24,x20                     // d+=h
+       add     x20,x20,x19                     // h+=Maj(a,b,c)
+       ldr     x19,[x30],#8            // *K++, x28 in next round
+       add     x3,x3,x9
+       add     x20,x20,x17                     // h+=Sigma0(a)
+       add     x3,x3,x8
+       cbnz    x19,.Loop_16_xx
+
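+       // One 128-byte block is done: reload the context, input and end-of-input
+       // pointers from the frame, rewind the round-constant pointer, fold the
+       // working registers back into the hash state, and loop while input remains.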
+       ldp     x0,x2,[x29,#96]
+       ldr     x1,[x29,#112]
+       sub     x30,x30,#648            // rewind
+
+       ldp     x3,x4,[x0]
+       ldp     x5,x6,[x0,#2*8]
+       add     x1,x1,#14*8                     // advance input pointer
+       ldp     x7,x8,[x0,#4*8]
+       add     x20,x20,x3
+       ldp     x9,x10,[x0,#6*8]
+       add     x21,x21,x4
+       add     x22,x22,x5
+       add     x23,x23,x6
+       stp     x20,x21,[x0]
+       add     x24,x24,x7
+       add     x25,x25,x8
+       stp     x22,x23,[x0,#2*8]
+       add     x26,x26,x9
+       add     x27,x27,x10
+       cmp     x1,x2
+       stp     x24,x25,[x0,#4*8]
+       stp     x26,x27,[x0,#6*8]
+       b.ne    .Loop
+
+       ldp     x19,x20,[x29,#16]
+       add     sp,sp,#4*8
+       ldp     x21,x22,[x29,#32]
+       ldp     x23,x24,[x29,#48]
+       ldp     x25,x26,[x29,#64]
+       ldp     x27,x28,[x29,#80]
+       ldp     x29,x30,[sp],#128
+       ret
+.size  zfs_sha512_block_armv7,.-zfs_sha512_block_armv7
+
+
+.globl zfs_sha512_block_armv8
+.type  zfs_sha512_block_armv8,%function
+.align 6
+zfs_sha512_block_armv8:
+.Lv8_entry:
+	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later
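+	// This routine uses the ARMv8 SHA-512 Crypto Extensions; the round
+	// primitives (sha512h, sha512h2, sha512su0, sha512su1) are emitted
+	// below as raw .inst encodings rather than mnemonics.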
+       stp             x29,x30,[sp,#-16]!
+       add             x29,sp,#0
+
+       ld1             {v16.16b-v19.16b},[x1],#64      // load input
+       ld1             {v20.16b-v23.16b},[x1],#64
+
+       ld1             {v0.2d-v3.2d},[x0]              // load context
+       adr             x3,.LK512
+
+       rev64           v16.16b,v16.16b
+       rev64           v17.16b,v17.16b
+       rev64           v18.16b,v18.16b
+       rev64           v19.16b,v19.16b
+       rev64           v20.16b,v20.16b
+       rev64           v21.16b,v21.16b
+       rev64           v22.16b,v22.16b
+       rev64           v23.16b,v23.16b
+       b               .Loop_hw
+
+.align 4
+.Loop_hw:
+       ld1             {v24.2d},[x3],#16
+       subs            x2,x2,#1
+       sub             x4,x1,#128
+       orr             v26.16b,v0.16b,v0.16b                   // offload
+       orr             v27.16b,v1.16b,v1.16b
+       orr             v28.16b,v2.16b,v2.16b
+       orr             v29.16b,v3.16b,v3.16b
+       csel            x1,x1,x4,ne                     // conditional rewind
+       add             v24.2d,v24.2d,v16.2d
+       ld1             {v25.2d},[x3],#16
+       ext             v24.16b,v24.16b,v24.16b,#8
+       ext             v5.16b,v2.16b,v3.16b,#8
+       ext             v6.16b,v1.16b,v2.16b,#8
+       add             v3.2d,v3.2d,v24.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec08230      //sha512su0 v16.16b,v17.16b
+        ext            v7.16b,v20.16b,v21.16b,#8
+       .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
+        .inst  0xce678af0      //sha512su1 v16.16b,v23.16b,v7.16b
+       add             v4.2d,v1.2d,v3.2d               // "D + T1"
+       .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
+       add             v25.2d,v25.2d,v17.2d
+       ld1             {v24.2d},[x3],#16
+       ext             v25.16b,v25.16b,v25.16b,#8
+       ext             v5.16b,v4.16b,v2.16b,#8
+       ext             v6.16b,v0.16b,v4.16b,#8
+       add             v2.2d,v2.2d,v25.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec08251      //sha512su0 v17.16b,v18.16b
+        ext            v7.16b,v21.16b,v22.16b,#8
+       .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
+        .inst  0xce678a11      //sha512su1 v17.16b,v16.16b,v7.16b
+       add             v1.2d,v0.2d,v2.2d               // "D + T1"
+       .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
+       add             v24.2d,v24.2d,v18.2d
+       ld1             {v25.2d},[x3],#16
+       ext             v24.16b,v24.16b,v24.16b,#8
+       ext             v5.16b,v1.16b,v4.16b,#8
+       ext             v6.16b,v3.16b,v1.16b,#8
+       add             v4.2d,v4.2d,v24.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec08272      //sha512su0 v18.16b,v19.16b
+        ext            v7.16b,v22.16b,v23.16b,#8
+       .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
+        .inst  0xce678a32      //sha512su1 v18.16b,v17.16b,v7.16b
+       add             v0.2d,v3.2d,v4.2d               // "D + T1"
+       .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
+       add             v25.2d,v25.2d,v19.2d
+       ld1             {v24.2d},[x3],#16
+       ext             v25.16b,v25.16b,v25.16b,#8
+       ext             v5.16b,v0.16b,v1.16b,#8
+       ext             v6.16b,v2.16b,v0.16b,#8
+       add             v1.2d,v1.2d,v25.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec08293      //sha512su0 v19.16b,v20.16b
+        ext            v7.16b,v23.16b,v16.16b,#8
+       .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
+        .inst  0xce678a53      //sha512su1 v19.16b,v18.16b,v7.16b
+       add             v3.2d,v2.2d,v1.2d               // "D + T1"
+       .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
+       add             v24.2d,v24.2d,v20.2d
+       ld1             {v25.2d},[x3],#16
+       ext             v24.16b,v24.16b,v24.16b,#8
+       ext             v5.16b,v3.16b,v0.16b,#8
+       ext             v6.16b,v4.16b,v3.16b,#8
+       add             v0.2d,v0.2d,v24.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec082b4      //sha512su0 v20.16b,v21.16b
+        ext            v7.16b,v16.16b,v17.16b,#8
+       .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
+        .inst  0xce678a74      //sha512su1 v20.16b,v19.16b,v7.16b
+       add             v2.2d,v4.2d,v0.2d               // "D + T1"
+       .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
+       add             v25.2d,v25.2d,v21.2d
+       ld1             {v24.2d},[x3],#16
+       ext             v25.16b,v25.16b,v25.16b,#8
+       ext             v5.16b,v2.16b,v3.16b,#8
+       ext             v6.16b,v1.16b,v2.16b,#8
+       add             v3.2d,v3.2d,v25.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec082d5      //sha512su0 v21.16b,v22.16b
+        ext            v7.16b,v17.16b,v18.16b,#8
+       .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
+        .inst  0xce678a95      //sha512su1 v21.16b,v20.16b,v7.16b
+       add             v4.2d,v1.2d,v3.2d               // "D + T1"
+       .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
+       add             v24.2d,v24.2d,v22.2d
+       ld1             {v25.2d},[x3],#16
+       ext             v24.16b,v24.16b,v24.16b,#8
+       ext             v5.16b,v4.16b,v2.16b,#8
+       ext             v6.16b,v0.16b,v4.16b,#8
+       add             v2.2d,v2.2d,v24.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec082f6      //sha512su0 v22.16b,v23.16b
+        ext            v7.16b,v18.16b,v19.16b,#8
+       .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
+        .inst  0xce678ab6      //sha512su1 v22.16b,v21.16b,v7.16b
+       add             v1.2d,v0.2d,v2.2d               // "D + T1"
+       .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
+       add             v25.2d,v25.2d,v23.2d
+       ld1             {v24.2d},[x3],#16
+       ext             v25.16b,v25.16b,v25.16b,#8
+       ext             v5.16b,v1.16b,v4.16b,#8
+       ext             v6.16b,v3.16b,v1.16b,#8
+       add             v4.2d,v4.2d,v25.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec08217      //sha512su0 v23.16b,v16.16b
+        ext            v7.16b,v19.16b,v20.16b,#8
+       .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
+        .inst  0xce678ad7      //sha512su1 v23.16b,v22.16b,v7.16b
+       add             v0.2d,v3.2d,v4.2d               // "D + T1"
+       .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
+       add             v24.2d,v24.2d,v16.2d
+       ld1             {v25.2d},[x3],#16
+       ext             v24.16b,v24.16b,v24.16b,#8
+       ext             v5.16b,v0.16b,v1.16b,#8
+       ext             v6.16b,v2.16b,v0.16b,#8
+       add             v1.2d,v1.2d,v24.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec08230      //sha512su0 v16.16b,v17.16b
+        ext            v7.16b,v20.16b,v21.16b,#8
+       .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
+        .inst  0xce678af0      //sha512su1 v16.16b,v23.16b,v7.16b
+       add             v3.2d,v2.2d,v1.2d               // "D + T1"
+       .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
+       add             v25.2d,v25.2d,v17.2d
+       ld1             {v24.2d},[x3],#16
+       ext             v25.16b,v25.16b,v25.16b,#8
+       ext             v5.16b,v3.16b,v0.16b,#8
+       ext             v6.16b,v4.16b,v3.16b,#8
+       add             v0.2d,v0.2d,v25.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec08251      //sha512su0 v17.16b,v18.16b
+        ext            v7.16b,v21.16b,v22.16b,#8
+       .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
+        .inst  0xce678a11      //sha512su1 v17.16b,v16.16b,v7.16b
+       add             v2.2d,v4.2d,v0.2d               // "D + T1"
+       .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
+       add             v24.2d,v24.2d,v18.2d
+       ld1             {v25.2d},[x3],#16
+       ext             v24.16b,v24.16b,v24.16b,#8
+       ext             v5.16b,v2.16b,v3.16b,#8
+       ext             v6.16b,v1.16b,v2.16b,#8
+       add             v3.2d,v3.2d,v24.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec08272      //sha512su0 v18.16b,v19.16b
+        ext            v7.16b,v22.16b,v23.16b,#8
+       .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
+        .inst  0xce678a32      //sha512su1 v18.16b,v17.16b,v7.16b
+       add             v4.2d,v1.2d,v3.2d               // "D + T1"
+       .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
+       add             v25.2d,v25.2d,v19.2d
+       ld1             {v24.2d},[x3],#16
+       ext             v25.16b,v25.16b,v25.16b,#8
+       ext             v5.16b,v4.16b,v2.16b,#8
+       ext             v6.16b,v0.16b,v4.16b,#8
+       add             v2.2d,v2.2d,v25.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec08293      //sha512su0 v19.16b,v20.16b
+        ext            v7.16b,v23.16b,v16.16b,#8
+       .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
+        .inst  0xce678a53      //sha512su1 v19.16b,v18.16b,v7.16b
+       add             v1.2d,v0.2d,v2.2d               // "D + T1"
+       .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
+       add             v24.2d,v24.2d,v20.2d
+       ld1             {v25.2d},[x3],#16
+       ext             v24.16b,v24.16b,v24.16b,#8
+       ext             v5.16b,v1.16b,v4.16b,#8
+       ext             v6.16b,v3.16b,v1.16b,#8
+       add             v4.2d,v4.2d,v24.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec082b4      //sha512su0 v20.16b,v21.16b
+        ext            v7.16b,v16.16b,v17.16b,#8
+       .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
+        .inst  0xce678a74      //sha512su1 v20.16b,v19.16b,v7.16b
+       add             v0.2d,v3.2d,v4.2d               // "D + T1"
+       .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
+       add             v25.2d,v25.2d,v21.2d
+       ld1             {v24.2d},[x3],#16
+       ext             v25.16b,v25.16b,v25.16b,#8
+       ext             v5.16b,v0.16b,v1.16b,#8
+       ext             v6.16b,v2.16b,v0.16b,#8
+       add             v1.2d,v1.2d,v25.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec082d5      //sha512su0 v21.16b,v22.16b
+        ext            v7.16b,v17.16b,v18.16b,#8
+       .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
+        .inst  0xce678a95      //sha512su1 v21.16b,v20.16b,v7.16b
+       add             v3.2d,v2.2d,v1.2d               // "D + T1"
+       .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
+       add             v24.2d,v24.2d,v22.2d
+       ld1             {v25.2d},[x3],#16
+       ext             v24.16b,v24.16b,v24.16b,#8
+       ext             v5.16b,v3.16b,v0.16b,#8
+       ext             v6.16b,v4.16b,v3.16b,#8
+       add             v0.2d,v0.2d,v24.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec082f6      //sha512su0 v22.16b,v23.16b
+        ext            v7.16b,v18.16b,v19.16b,#8
+       .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
+        .inst  0xce678ab6      //sha512su1 v22.16b,v21.16b,v7.16b
+       add             v2.2d,v4.2d,v0.2d               // "D + T1"
+       .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
+       add             v25.2d,v25.2d,v23.2d
+       ld1             {v24.2d},[x3],#16
+       ext             v25.16b,v25.16b,v25.16b,#8
+       ext             v5.16b,v2.16b,v3.16b,#8
+       ext             v6.16b,v1.16b,v2.16b,#8
+       add             v3.2d,v3.2d,v25.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec08217      //sha512su0 v23.16b,v16.16b
+        ext            v7.16b,v19.16b,v20.16b,#8
+       .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
+        .inst  0xce678ad7      //sha512su1 v23.16b,v22.16b,v7.16b
+       add             v4.2d,v1.2d,v3.2d               // "D + T1"
+       .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
+       add             v24.2d,v24.2d,v16.2d
+       ld1             {v25.2d},[x3],#16
+       ext             v24.16b,v24.16b,v24.16b,#8
+       ext             v5.16b,v4.16b,v2.16b,#8
+       ext             v6.16b,v0.16b,v4.16b,#8
+       add             v2.2d,v2.2d,v24.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec08230      //sha512su0 v16.16b,v17.16b
+        ext            v7.16b,v20.16b,v21.16b,#8
+       .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
+        .inst  0xce678af0      //sha512su1 v16.16b,v23.16b,v7.16b
+       add             v1.2d,v0.2d,v2.2d               // "D + T1"
+       .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
+       add             v25.2d,v25.2d,v17.2d
+       ld1             {v24.2d},[x3],#16
+       ext             v25.16b,v25.16b,v25.16b,#8
+       ext             v5.16b,v1.16b,v4.16b,#8
+       ext             v6.16b,v3.16b,v1.16b,#8
+       add             v4.2d,v4.2d,v25.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec08251      //sha512su0 v17.16b,v18.16b
+        ext            v7.16b,v21.16b,v22.16b,#8
+       .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
+        .inst  0xce678a11      //sha512su1 v17.16b,v16.16b,v7.16b
+       add             v0.2d,v3.2d,v4.2d               // "D + T1"
+       .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
+       add             v24.2d,v24.2d,v18.2d
+       ld1             {v25.2d},[x3],#16
+       ext             v24.16b,v24.16b,v24.16b,#8
+       ext             v5.16b,v0.16b,v1.16b,#8
+       ext             v6.16b,v2.16b,v0.16b,#8
+       add             v1.2d,v1.2d,v24.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec08272      //sha512su0 v18.16b,v19.16b
+        ext            v7.16b,v22.16b,v23.16b,#8
+       .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
+        .inst  0xce678a32      //sha512su1 v18.16b,v17.16b,v7.16b
+       add             v3.2d,v2.2d,v1.2d               // "D + T1"
+       .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
+       add             v25.2d,v25.2d,v19.2d
+       ld1             {v24.2d},[x3],#16
+       ext             v25.16b,v25.16b,v25.16b,#8
+       ext             v5.16b,v3.16b,v0.16b,#8
+       ext             v6.16b,v4.16b,v3.16b,#8
+       add             v0.2d,v0.2d,v25.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec08293      //sha512su0 v19.16b,v20.16b
+        ext            v7.16b,v23.16b,v16.16b,#8
+       .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
+        .inst  0xce678a53      //sha512su1 v19.16b,v18.16b,v7.16b
+       add             v2.2d,v4.2d,v0.2d               // "D + T1"
+       .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
+       add             v24.2d,v24.2d,v20.2d
+       ld1             {v25.2d},[x3],#16
+       ext             v24.16b,v24.16b,v24.16b,#8
+       ext             v5.16b,v2.16b,v3.16b,#8
+       ext             v6.16b,v1.16b,v2.16b,#8
+       add             v3.2d,v3.2d,v24.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec082b4      //sha512su0 v20.16b,v21.16b
+        ext            v7.16b,v16.16b,v17.16b,#8
+       .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
+        .inst  0xce678a74      //sha512su1 v20.16b,v19.16b,v7.16b
+       add             v4.2d,v1.2d,v3.2d               // "D + T1"
+       .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
+       add             v25.2d,v25.2d,v21.2d
+       ld1             {v24.2d},[x3],#16
+       ext             v25.16b,v25.16b,v25.16b,#8
+       ext             v5.16b,v4.16b,v2.16b,#8
+       ext             v6.16b,v0.16b,v4.16b,#8
+       add             v2.2d,v2.2d,v25.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec082d5      //sha512su0 v21.16b,v22.16b
+        ext            v7.16b,v17.16b,v18.16b,#8
+       .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
+        .inst  0xce678a95      //sha512su1 v21.16b,v20.16b,v7.16b
+       add             v1.2d,v0.2d,v2.2d               // "D + T1"
+       .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
+       add             v24.2d,v24.2d,v22.2d
+       ld1             {v25.2d},[x3],#16
+       ext             v24.16b,v24.16b,v24.16b,#8
+       ext             v5.16b,v1.16b,v4.16b,#8
+       ext             v6.16b,v3.16b,v1.16b,#8
+       add             v4.2d,v4.2d,v24.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec082f6      //sha512su0 v22.16b,v23.16b
+        ext            v7.16b,v18.16b,v19.16b,#8
+       .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
+        .inst  0xce678ab6      //sha512su1 v22.16b,v21.16b,v7.16b
+       add             v0.2d,v3.2d,v4.2d               // "D + T1"
+       .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
+       add             v25.2d,v25.2d,v23.2d
+       ld1             {v24.2d},[x3],#16
+       ext             v25.16b,v25.16b,v25.16b,#8
+       ext             v5.16b,v0.16b,v1.16b,#8
+       ext             v6.16b,v2.16b,v0.16b,#8
+       add             v1.2d,v1.2d,v25.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec08217      //sha512su0 v23.16b,v16.16b
+        ext            v7.16b,v19.16b,v20.16b,#8
+       .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
+        .inst  0xce678ad7      //sha512su1 v23.16b,v22.16b,v7.16b
+       add             v3.2d,v2.2d,v1.2d               // "D + T1"
+       .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
+       add             v24.2d,v24.2d,v16.2d
+       ld1             {v25.2d},[x3],#16
+       ext             v24.16b,v24.16b,v24.16b,#8
+       ext             v5.16b,v3.16b,v0.16b,#8
+       ext             v6.16b,v4.16b,v3.16b,#8
+       add             v0.2d,v0.2d,v24.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec08230      //sha512su0 v16.16b,v17.16b
+        ext            v7.16b,v20.16b,v21.16b,#8
+       .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
+        .inst  0xce678af0      //sha512su1 v16.16b,v23.16b,v7.16b
+       add             v2.2d,v4.2d,v0.2d               // "D + T1"
+       .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
+       add             v25.2d,v25.2d,v17.2d
+       ld1             {v24.2d},[x3],#16
+       ext             v25.16b,v25.16b,v25.16b,#8
+       ext             v5.16b,v2.16b,v3.16b,#8
+       ext             v6.16b,v1.16b,v2.16b,#8
+       add             v3.2d,v3.2d,v25.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec08251      //sha512su0 v17.16b,v18.16b
+        ext            v7.16b,v21.16b,v22.16b,#8
+       .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
+        .inst  0xce678a11      //sha512su1 v17.16b,v16.16b,v7.16b
+       add             v4.2d,v1.2d,v3.2d               // "D + T1"
+       .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
+       add             v24.2d,v24.2d,v18.2d
+       ld1             {v25.2d},[x3],#16
+       ext             v24.16b,v24.16b,v24.16b,#8
+       ext             v5.16b,v4.16b,v2.16b,#8
+       ext             v6.16b,v0.16b,v4.16b,#8
+       add             v2.2d,v2.2d,v24.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec08272      //sha512su0 v18.16b,v19.16b
+        ext            v7.16b,v22.16b,v23.16b,#8
+       .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
+        .inst  0xce678a32      //sha512su1 v18.16b,v17.16b,v7.16b
+       add             v1.2d,v0.2d,v2.2d               // "D + T1"
+       .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
+       add             v25.2d,v25.2d,v19.2d
+       ld1             {v24.2d},[x3],#16
+       ext             v25.16b,v25.16b,v25.16b,#8
+       ext             v5.16b,v1.16b,v4.16b,#8
+       ext             v6.16b,v3.16b,v1.16b,#8
+       add             v4.2d,v4.2d,v25.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec08293      //sha512su0 v19.16b,v20.16b
+        ext            v7.16b,v23.16b,v16.16b,#8
+       .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
+        .inst  0xce678a53      //sha512su1 v19.16b,v18.16b,v7.16b
+       add             v0.2d,v3.2d,v4.2d               // "D + T1"
+       .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
+       add             v24.2d,v24.2d,v20.2d
+       ld1             {v25.2d},[x3],#16
+       ext             v24.16b,v24.16b,v24.16b,#8
+       ext             v5.16b,v0.16b,v1.16b,#8
+       ext             v6.16b,v2.16b,v0.16b,#8
+       add             v1.2d,v1.2d,v24.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec082b4      //sha512su0 v20.16b,v21.16b
+        ext            v7.16b,v16.16b,v17.16b,#8
+       .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
+        .inst  0xce678a74      //sha512su1 v20.16b,v19.16b,v7.16b
+       add             v3.2d,v2.2d,v1.2d               // "D + T1"
+       .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
+       add             v25.2d,v25.2d,v21.2d
+       ld1             {v24.2d},[x3],#16
+       ext             v25.16b,v25.16b,v25.16b,#8
+       ext             v5.16b,v3.16b,v0.16b,#8
+       ext             v6.16b,v4.16b,v3.16b,#8
+       add             v0.2d,v0.2d,v25.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec082d5      //sha512su0 v21.16b,v22.16b
+        ext            v7.16b,v17.16b,v18.16b,#8
+       .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
+        .inst  0xce678a95      //sha512su1 v21.16b,v20.16b,v7.16b
+       add             v2.2d,v4.2d,v0.2d               // "D + T1"
+       .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
+       add             v24.2d,v24.2d,v22.2d
+       ld1             {v25.2d},[x3],#16
+       ext             v24.16b,v24.16b,v24.16b,#8
+       ext             v5.16b,v2.16b,v3.16b,#8
+       ext             v6.16b,v1.16b,v2.16b,#8
+       add             v3.2d,v3.2d,v24.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec082f6      //sha512su0 v22.16b,v23.16b
+        ext            v7.16b,v18.16b,v19.16b,#8
+       .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
+        .inst  0xce678ab6      //sha512su1 v22.16b,v21.16b,v7.16b
+       add             v4.2d,v1.2d,v3.2d               // "D + T1"
+       .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
+       add             v25.2d,v25.2d,v23.2d
+       ld1             {v24.2d},[x3],#16
+       ext             v25.16b,v25.16b,v25.16b,#8
+       ext             v5.16b,v4.16b,v2.16b,#8
+       ext             v6.16b,v0.16b,v4.16b,#8
+       add             v2.2d,v2.2d,v25.2d                      // "T1 + H + K512[i]"
+        .inst  0xcec08217      //sha512su0 v23.16b,v16.16b
+        ext            v7.16b,v19.16b,v20.16b,#8
+       .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
+        .inst  0xce678ad7      //sha512su1 v23.16b,v22.16b,v7.16b
+       add             v1.2d,v0.2d,v2.2d               // "D + T1"
+       .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
+       ld1             {v25.2d},[x3],#16
+       add             v24.2d,v24.2d,v16.2d
+        ld1            {v16.16b},[x1],#16              // load next input
+       ext             v24.16b,v24.16b,v24.16b,#8
+       ext             v5.16b,v1.16b,v4.16b,#8
+       ext             v6.16b,v3.16b,v1.16b,#8
+       add             v4.2d,v4.2d,v24.2d                      // "T1 + H + K512[i]"
+       .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
+        rev64          v16.16b,v16.16b
+       add             v0.2d,v3.2d,v4.2d               // "D + T1"
+       .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
+       ld1             {v24.2d},[x3],#16
+       add             v25.2d,v25.2d,v17.2d
+        ld1            {v17.16b},[x1],#16              // load next input
+       ext             v25.16b,v25.16b,v25.16b,#8
+       ext             v5.16b,v0.16b,v1.16b,#8
+       ext             v6.16b,v2.16b,v0.16b,#8
+       add             v1.2d,v1.2d,v25.2d                      // "T1 + H + K512[i]"
+       .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
+        rev64          v17.16b,v17.16b
+       add             v3.2d,v2.2d,v1.2d               // "D + T1"
+       .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
+       ld1             {v25.2d},[x3],#16
+       add             v24.2d,v24.2d,v18.2d
+        ld1            {v18.16b},[x1],#16              // load next input
+       ext             v24.16b,v24.16b,v24.16b,#8
+       ext             v5.16b,v3.16b,v0.16b,#8
+       ext             v6.16b,v4.16b,v3.16b,#8
+       add             v0.2d,v0.2d,v24.2d                      // "T1 + H + K512[i]"
+       .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
+        rev64          v18.16b,v18.16b
+       add             v2.2d,v4.2d,v0.2d               // "D + T1"
+       .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
+       ld1             {v24.2d},[x3],#16
+       add             v25.2d,v25.2d,v19.2d
+        ld1            {v19.16b},[x1],#16              // load next input
+       ext             v25.16b,v25.16b,v25.16b,#8
+       ext             v5.16b,v2.16b,v3.16b,#8
+       ext             v6.16b,v1.16b,v2.16b,#8
+       add             v3.2d,v3.2d,v25.2d                      // "T1 + H + K512[i]"
+       .inst   0xce6680a3      //sha512h v3.16b,v5.16b,v6.16b
+        rev64          v19.16b,v19.16b
+       add             v4.2d,v1.2d,v3.2d               // "D + T1"
+       .inst   0xce608423      //sha512h2 v3.16b,v1.16b,v0.16b
+       ld1             {v25.2d},[x3],#16
+       add             v24.2d,v24.2d,v20.2d
+        ld1            {v20.16b},[x1],#16              // load next input
+       ext             v24.16b,v24.16b,v24.16b,#8
+       ext             v5.16b,v4.16b,v2.16b,#8
+       ext             v6.16b,v0.16b,v4.16b,#8
+       add             v2.2d,v2.2d,v24.2d                      // "T1 + H + K512[i]"
+       .inst   0xce6680a2      //sha512h v2.16b,v5.16b,v6.16b
+        rev64          v20.16b,v20.16b
+       add             v1.2d,v0.2d,v2.2d               // "D + T1"
+       .inst   0xce638402      //sha512h2 v2.16b,v0.16b,v3.16b
+       ld1             {v24.2d},[x3],#16
+       add             v25.2d,v25.2d,v21.2d
+        ld1            {v21.16b},[x1],#16              // load next input
+       ext             v25.16b,v25.16b,v25.16b,#8
+       ext             v5.16b,v1.16b,v4.16b,#8
+       ext             v6.16b,v3.16b,v1.16b,#8
+       add             v4.2d,v4.2d,v25.2d                      // "T1 + H + K512[i]"
+       .inst   0xce6680a4      //sha512h v4.16b,v5.16b,v6.16b
+        rev64          v21.16b,v21.16b
+       add             v0.2d,v3.2d,v4.2d               // "D + T1"
+       .inst   0xce628464      //sha512h2 v4.16b,v3.16b,v2.16b
+       ld1             {v25.2d},[x3],#16
+       add             v24.2d,v24.2d,v22.2d
+        ld1            {v22.16b},[x1],#16              // load next input
+       ext             v24.16b,v24.16b,v24.16b,#8
+       ext             v5.16b,v0.16b,v1.16b,#8
+       ext             v6.16b,v2.16b,v0.16b,#8
+       add             v1.2d,v1.2d,v24.2d                      // "T1 + H + K512[i]"
+       .inst   0xce6680a1      //sha512h v1.16b,v5.16b,v6.16b
+        rev64          v22.16b,v22.16b
+       add             v3.2d,v2.2d,v1.2d               // "D + T1"
+       .inst   0xce648441      //sha512h2 v1.16b,v2.16b,v4.16b
+       sub             x3,x3,#80*8     // rewind
+       add             v25.2d,v25.2d,v23.2d
+        ld1            {v23.16b},[x1],#16              // load next input
+       ext             v25.16b,v25.16b,v25.16b,#8
+       ext             v5.16b,v3.16b,v0.16b,#8
+       ext             v6.16b,v4.16b,v3.16b,#8
+       add             v0.2d,v0.2d,v25.2d                      // "T1 + H + K512[i]"
+       .inst   0xce6680a0      //sha512h v0.16b,v5.16b,v6.16b
+        rev64          v23.16b,v23.16b
+       add             v2.2d,v4.2d,v0.2d               // "D + T1"
+       .inst   0xce618480      //sha512h2 v0.16b,v4.16b,v1.16b
+       add             v0.2d,v0.2d,v26.2d                      // accumulate
+       add             v1.2d,v1.2d,v27.2d
+       add             v2.2d,v2.2d,v28.2d
+       add             v3.2d,v3.2d,v29.2d
+
+       cbnz            x2,.Loop_hw
+
+       st1             {v0.2d-v3.2d},[x0]              // store context
+
+       ldr             x29,[sp],#16
+       ret
+.size  zfs_sha512_block_armv8,.-zfs_sha512_block_armv8
+#endif
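Note: the assembly above exports zfs_sha512_block_armv7 and zfs_sha512_block_armv8 as plain block-transform routines. As a minimal illustrative sketch (not part of this commit), and assuming the OpenSSL-style prototype that the register usage suggests (x0 = 8-word state, x1 = input, x2 = number of 128-byte blocks), a C caller could drive them as shown below; the prototype and helper function are hypothetical:

#include <stdint.h>
#include <stddef.h>

/* Assumed prototype: x0 = state, x1 = input, x2 = 128-byte block count. */
extern void zfs_sha512_block_armv8(uint64_t state[8], const void *in,
    size_t blocks);

/* Hypothetical helper: transform all whole 128-byte blocks in buf. */
static void
sha512_transform_blocks(uint64_t state[8], const uint8_t *buf, size_t len)
{
	size_t blocks = len / 128;	/* SHA-512 block size */

	if (blocks != 0)
		zfs_sha512_block_armv8(state, buf, blocks);
	/* The caller buffers any trailing partial block and handles padding. */
}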
diff --git a/module/icp/asm-arm/sha2/sha256-armv7.S b/module/icp/asm-arm/sha2/sha256-armv7.S
new file mode 100644 (file)
index 0000000..0001e4d
--- /dev/null
@@ -0,0 +1,2769 @@
+/*
+ * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Portions Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ * - modified assembly to fit into OpenZFS
+ */
+
+#if defined(__arm__)
+
+#define        __ARM_ARCH__      7
+#define        __ARM_MAX_ARCH__  7
+
+#if defined(__thumb2__)
+.syntax unified
+.thumb
+#else
+.code   32
+#endif
+
+.text
+
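+@ K256 below holds the 64 SHA-256 round constants (the first 32 bits of the
+@ fractional parts of the cube roots of the first 64 primes), plus a zero
+@ terminator word.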
+.type  K256,%object
+.align 5
+K256:
+.word  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.word  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.word  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.word  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.word  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.word  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.word  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.word  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.word  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.word  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.word  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.word  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.word  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.word  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.word  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.word  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.size  K256,.-K256
+.word  0                               @ terminator
+
+.align 5
+.globl zfs_sha256_block_armv7
+.type  zfs_sha256_block_armv7,%function
+zfs_sha256_block_armv7:
+.Lzfs_sha256_block_armv7:
+
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
+       sub     r3,pc,#8                @ zfs_sha256_block_armv7
+#else
+       adr     r3,.Lzfs_sha256_block_armv7
+#endif
+
+       add     r2,r1,r2,lsl#6  @ len to point at the end of inp
+       stmdb   sp!,{r0,r1,r2,r4-r11,lr}
+       ldmia   r0,{r4,r5,r6,r7,r8,r9,r10,r11}
+       sub     r14,r3,#256+32  @ K256
+       sub     sp,sp,#16*4             @ alloca(X[16])
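+@ At this point r4-r11 hold the eight working variables a-h, r14 points at
+@ K256, r1 walks the input, r2 marks the end of the input, and sp holds the
+@ 16-word message schedule X[].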
+.Loop:
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r5,r6                @ magic
+       eor     r12,r12,r12
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 0
+# if 0==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r8,r8,ror#5
+       add     r4,r4,r12                       @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r8,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 0
+       add     r4,r4,r12                       @ h+=Maj(a,b,c) from the past
+       ldrb    r12,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r12,lsl#8
+       ldrb    r12,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 0==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r8,r8,ror#5
+       orr     r2,r2,r12,lsl#24
+       eor     r0,r0,r8,ror#19 @ Sigma1(e)
+#endif
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r11,r11,r2                      @ h+=X[i]
+       str     r2,[sp,#0*4]
+       eor     r2,r9,r10
+       add     r11,r11,r0,ror#6        @ h+=Sigma1(e)
+       and     r2,r2,r8
+       add     r11,r11,r12                     @ h+=K256[i]
+       eor     r2,r2,r10                       @ Ch(e,f,g)
+       eor     r0,r4,r4,ror#11
+       add     r11,r11,r2                      @ h+=Ch(e,f,g)
+#if 0==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 0<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r4,r5                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#2*4]            @ from future BODY_16_xx
+       eor     r12,r4,r5                       @ a^b, b^c in next round
+       ldr     r1,[sp,#15*4]   @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r4,ror#20 @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r7,r7,r11                       @ d+=h
+       eor     r3,r3,r5                        @ Maj(a,b,c)
+       add     r11,r11,r0,ror#2        @ h+=Sigma0(a)
+       @ add   r11,r11,r3                      @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 1
+# if 1==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r7,r7,ror#5
+       add     r11,r11,r3                      @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r7,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 1
+       add     r11,r11,r3                      @ h+=Maj(a,b,c) from the past
+       ldrb    r3,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r3,lsl#8
+       ldrb    r3,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 1==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r7,r7,ror#5
+       orr     r2,r2,r3,lsl#24
+       eor     r0,r0,r7,ror#19 @ Sigma1(e)
+#endif
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r10,r10,r2                      @ h+=X[i]
+       str     r2,[sp,#1*4]
+       eor     r2,r8,r9
+       add     r10,r10,r0,ror#6        @ h+=Sigma1(e)
+       and     r2,r2,r7
+       add     r10,r10,r3                      @ h+=K256[i]
+       eor     r2,r2,r9                        @ Ch(e,f,g)
+       eor     r0,r11,r11,ror#11
+       add     r10,r10,r2                      @ h+=Ch(e,f,g)
+#if 1==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 1<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r11,r4                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#3*4]            @ from future BODY_16_xx
+       eor     r3,r11,r4                       @ a^b, b^c in next round
+       ldr     r1,[sp,#0*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r11,ror#20        @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r6,r6,r10                       @ d+=h
+       eor     r12,r12,r4                      @ Maj(a,b,c)
+       add     r10,r10,r0,ror#2        @ h+=Sigma0(a)
+       @ add   r10,r10,r12                     @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 2
+# if 2==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r6,r6,ror#5
+       add     r10,r10,r12                     @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r6,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 2
+       add     r10,r10,r12                     @ h+=Maj(a,b,c) from the past
+       ldrb    r12,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r12,lsl#8
+       ldrb    r12,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 2==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r6,r6,ror#5
+       orr     r2,r2,r12,lsl#24
+       eor     r0,r0,r6,ror#19 @ Sigma1(e)
+#endif
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r9,r9,r2                        @ h+=X[i]
+       str     r2,[sp,#2*4]
+       eor     r2,r7,r8
+       add     r9,r9,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r6
+       add     r9,r9,r12                       @ h+=K256[i]
+       eor     r2,r2,r8                        @ Ch(e,f,g)
+       eor     r0,r10,r10,ror#11
+       add     r9,r9,r2                        @ h+=Ch(e,f,g)
+#if 2==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 2<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r10,r11                     @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#4*4]            @ from future BODY_16_xx
+       eor     r12,r10,r11                     @ a^b, b^c in next round
+       ldr     r1,[sp,#1*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r10,ror#20        @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r5,r5,r9                        @ d+=h
+       eor     r3,r3,r11                       @ Maj(a,b,c)
+       add     r9,r9,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r9,r9,r3                        @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 3
+# if 3==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r5,r5,ror#5
+       add     r9,r9,r3                        @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r5,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 3
+       add     r9,r9,r3                        @ h+=Maj(a,b,c) from the past
+       ldrb    r3,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r3,lsl#8
+       ldrb    r3,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 3==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r5,r5,ror#5
+       orr     r2,r2,r3,lsl#24
+       eor     r0,r0,r5,ror#19 @ Sigma1(e)
+#endif
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r8,r8,r2                        @ h+=X[i]
+       str     r2,[sp,#3*4]
+       eor     r2,r6,r7
+       add     r8,r8,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r5
+       add     r8,r8,r3                        @ h+=K256[i]
+       eor     r2,r2,r7                        @ Ch(e,f,g)
+       eor     r0,r9,r9,ror#11
+       add     r8,r8,r2                        @ h+=Ch(e,f,g)
+#if 3==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 3<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r9,r10                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#5*4]            @ from future BODY_16_xx
+       eor     r3,r9,r10                       @ a^b, b^c in next round
+       ldr     r1,[sp,#2*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r9,ror#20 @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r4,r4,r8                        @ d+=h
+       eor     r12,r12,r10                     @ Maj(a,b,c)
+       add     r8,r8,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r8,r8,r12                       @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 4
+# if 4==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r4,r4,ror#5
+       add     r8,r8,r12                       @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r4,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 4
+       add     r8,r8,r12                       @ h+=Maj(a,b,c) from the past
+       ldrb    r12,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r12,lsl#8
+       ldrb    r12,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 4==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r4,r4,ror#5
+       orr     r2,r2,r12,lsl#24
+       eor     r0,r0,r4,ror#19 @ Sigma1(e)
+#endif
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r7,r7,r2                        @ h+=X[i]
+       str     r2,[sp,#4*4]
+       eor     r2,r5,r6
+       add     r7,r7,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r4
+       add     r7,r7,r12                       @ h+=K256[i]
+       eor     r2,r2,r6                        @ Ch(e,f,g)
+       eor     r0,r8,r8,ror#11
+       add     r7,r7,r2                        @ h+=Ch(e,f,g)
+#if 4==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 4<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r8,r9                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#6*4]            @ from future BODY_16_xx
+       eor     r12,r8,r9                       @ a^b, b^c in next round
+       ldr     r1,[sp,#3*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r8,ror#20 @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r11,r11,r7                      @ d+=h
+       eor     r3,r3,r9                        @ Maj(a,b,c)
+       add     r7,r7,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r7,r7,r3                        @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 5
+# if 5==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r11,r11,ror#5
+       add     r7,r7,r3                        @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r11,ror#19        @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 5
+       add     r7,r7,r3                        @ h+=Maj(a,b,c) from the past
+       ldrb    r3,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r3,lsl#8
+       ldrb    r3,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 5==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r11,r11,ror#5
+       orr     r2,r2,r3,lsl#24
+       eor     r0,r0,r11,ror#19        @ Sigma1(e)
+#endif
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r6,r6,r2                        @ h+=X[i]
+       str     r2,[sp,#5*4]
+       eor     r2,r4,r5
+       add     r6,r6,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r11
+       add     r6,r6,r3                        @ h+=K256[i]
+       eor     r2,r2,r5                        @ Ch(e,f,g)
+       eor     r0,r7,r7,ror#11
+       add     r6,r6,r2                        @ h+=Ch(e,f,g)
+#if 5==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 5<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r7,r8                        @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#7*4]            @ from future BODY_16_xx
+       eor     r3,r7,r8                        @ a^b, b^c in next round
+       ldr     r1,[sp,#4*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r7,ror#20 @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r10,r10,r6                      @ d+=h
+       eor     r12,r12,r8                      @ Maj(a,b,c)
+       add     r6,r6,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r6,r6,r12                       @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 6
+# if 6==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r10,r10,ror#5
+       add     r6,r6,r12                       @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r10,ror#19        @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 6
+       add     r6,r6,r12                       @ h+=Maj(a,b,c) from the past
+       ldrb    r12,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r12,lsl#8
+       ldrb    r12,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 6==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r10,r10,ror#5
+       orr     r2,r2,r12,lsl#24
+       eor     r0,r0,r10,ror#19        @ Sigma1(e)
+#endif
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r5,r5,r2                        @ h+=X[i]
+       str     r2,[sp,#6*4]
+       eor     r2,r11,r4
+       add     r5,r5,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r10
+       add     r5,r5,r12                       @ h+=K256[i]
+       eor     r2,r2,r4                        @ Ch(e,f,g)
+       eor     r0,r6,r6,ror#11
+       add     r5,r5,r2                        @ h+=Ch(e,f,g)
+#if 6==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 6<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r6,r7                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#8*4]            @ from future BODY_16_xx
+       eor     r12,r6,r7                       @ a^b, b^c in next round
+       ldr     r1,[sp,#5*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r6,ror#20 @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r9,r9,r5                        @ d+=h
+       eor     r3,r3,r7                        @ Maj(a,b,c)
+       add     r5,r5,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r5,r5,r3                        @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 7
+# if 7==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r9,r9,ror#5
+       add     r5,r5,r3                        @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r9,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 7
+       add     r5,r5,r3                        @ h+=Maj(a,b,c) from the past
+       ldrb    r3,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r3,lsl#8
+       ldrb    r3,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 7==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r9,r9,ror#5
+       orr     r2,r2,r3,lsl#24
+       eor     r0,r0,r9,ror#19 @ Sigma1(e)
+#endif
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r4,r4,r2                        @ h+=X[i]
+       str     r2,[sp,#7*4]
+       eor     r2,r10,r11
+       add     r4,r4,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r9
+       add     r4,r4,r3                        @ h+=K256[i]
+       eor     r2,r2,r11                       @ Ch(e,f,g)
+       eor     r0,r5,r5,ror#11
+       add     r4,r4,r2                        @ h+=Ch(e,f,g)
+#if 7==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 7<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r5,r6                        @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#9*4]            @ from future BODY_16_xx
+       eor     r3,r5,r6                        @ a^b, b^c in next round
+       ldr     r1,[sp,#6*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r5,ror#20 @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r8,r8,r4                        @ d+=h
+       eor     r12,r12,r6                      @ Maj(a,b,c)
+       add     r4,r4,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r4,r4,r12                       @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 8
+# if 8==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r8,r8,ror#5
+       add     r4,r4,r12                       @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r8,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 8
+       add     r4,r4,r12                       @ h+=Maj(a,b,c) from the past
+       ldrb    r12,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r12,lsl#8
+       ldrb    r12,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 8==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r8,r8,ror#5
+       orr     r2,r2,r12,lsl#24
+       eor     r0,r0,r8,ror#19 @ Sigma1(e)
+#endif
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r11,r11,r2                      @ h+=X[i]
+       str     r2,[sp,#8*4]
+       eor     r2,r9,r10
+       add     r11,r11,r0,ror#6        @ h+=Sigma1(e)
+       and     r2,r2,r8
+       add     r11,r11,r12                     @ h+=K256[i]
+       eor     r2,r2,r10                       @ Ch(e,f,g)
+       eor     r0,r4,r4,ror#11
+       add     r11,r11,r2                      @ h+=Ch(e,f,g)
+#if 8==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 8<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r4,r5                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#10*4]           @ from future BODY_16_xx
+       eor     r12,r4,r5                       @ a^b, b^c in next round
+       ldr     r1,[sp,#7*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r4,ror#20 @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r7,r7,r11                       @ d+=h
+       eor     r3,r3,r5                        @ Maj(a,b,c)
+       add     r11,r11,r0,ror#2        @ h+=Sigma0(a)
+       @ add   r11,r11,r3                      @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 9
+# if 9==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r7,r7,ror#5
+       add     r11,r11,r3                      @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r7,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 9
+       add     r11,r11,r3                      @ h+=Maj(a,b,c) from the past
+       ldrb    r3,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r3,lsl#8
+       ldrb    r3,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 9==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r7,r7,ror#5
+       orr     r2,r2,r3,lsl#24
+       eor     r0,r0,r7,ror#19 @ Sigma1(e)
+#endif
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r10,r10,r2                      @ h+=X[i]
+       str     r2,[sp,#9*4]
+       eor     r2,r8,r9
+       add     r10,r10,r0,ror#6        @ h+=Sigma1(e)
+       and     r2,r2,r7
+       add     r10,r10,r3                      @ h+=K256[i]
+       eor     r2,r2,r9                        @ Ch(e,f,g)
+       eor     r0,r11,r11,ror#11
+       add     r10,r10,r2                      @ h+=Ch(e,f,g)
+#if 9==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 9<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r11,r4                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#11*4]           @ from future BODY_16_xx
+       eor     r3,r11,r4                       @ a^b, b^c in next round
+       ldr     r1,[sp,#8*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r11,ror#20        @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r6,r6,r10                       @ d+=h
+       eor     r12,r12,r4                      @ Maj(a,b,c)
+       add     r10,r10,r0,ror#2        @ h+=Sigma0(a)
+       @ add   r10,r10,r12                     @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 10
+# if 10==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r6,r6,ror#5
+       add     r10,r10,r12                     @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r6,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 10
+       add     r10,r10,r12                     @ h+=Maj(a,b,c) from the past
+       ldrb    r12,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r12,lsl#8
+       ldrb    r12,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 10==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r6,r6,ror#5
+       orr     r2,r2,r12,lsl#24
+       eor     r0,r0,r6,ror#19 @ Sigma1(e)
+#endif
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r9,r9,r2                        @ h+=X[i]
+       str     r2,[sp,#10*4]
+       eor     r2,r7,r8
+       add     r9,r9,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r6
+       add     r9,r9,r12                       @ h+=K256[i]
+       eor     r2,r2,r8                        @ Ch(e,f,g)
+       eor     r0,r10,r10,ror#11
+       add     r9,r9,r2                        @ h+=Ch(e,f,g)
+#if 10==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 10<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r10,r11                     @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#12*4]           @ from future BODY_16_xx
+       eor     r12,r10,r11                     @ a^b, b^c in next round
+       ldr     r1,[sp,#9*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r10,ror#20        @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r5,r5,r9                        @ d+=h
+       eor     r3,r3,r11                       @ Maj(a,b,c)
+       add     r9,r9,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r9,r9,r3                        @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 11
+# if 11==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r5,r5,ror#5
+       add     r9,r9,r3                        @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r5,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 11
+       add     r9,r9,r3                        @ h+=Maj(a,b,c) from the past
+       ldrb    r3,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r3,lsl#8
+       ldrb    r3,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 11==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r5,r5,ror#5
+       orr     r2,r2,r3,lsl#24
+       eor     r0,r0,r5,ror#19 @ Sigma1(e)
+#endif
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r8,r8,r2                        @ h+=X[i]
+       str     r2,[sp,#11*4]
+       eor     r2,r6,r7
+       add     r8,r8,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r5
+       add     r8,r8,r3                        @ h+=K256[i]
+       eor     r2,r2,r7                        @ Ch(e,f,g)
+       eor     r0,r9,r9,ror#11
+       add     r8,r8,r2                        @ h+=Ch(e,f,g)
+#if 11==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 11<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r9,r10                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#13*4]           @ from future BODY_16_xx
+       eor     r3,r9,r10                       @ a^b, b^c in next round
+       ldr     r1,[sp,#10*4]   @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r9,ror#20 @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r4,r4,r8                        @ d+=h
+       eor     r12,r12,r10                     @ Maj(a,b,c)
+       add     r8,r8,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r8,r8,r12                       @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 12
+# if 12==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r4,r4,ror#5
+       add     r8,r8,r12                       @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r4,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 12
+       add     r8,r8,r12                       @ h+=Maj(a,b,c) from the past
+       ldrb    r12,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r12,lsl#8
+       ldrb    r12,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 12==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r4,r4,ror#5
+       orr     r2,r2,r12,lsl#24
+       eor     r0,r0,r4,ror#19 @ Sigma1(e)
+#endif
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r7,r7,r2                        @ h+=X[i]
+       str     r2,[sp,#12*4]
+       eor     r2,r5,r6
+       add     r7,r7,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r4
+       add     r7,r7,r12                       @ h+=K256[i]
+       eor     r2,r2,r6                        @ Ch(e,f,g)
+       eor     r0,r8,r8,ror#11
+       add     r7,r7,r2                        @ h+=Ch(e,f,g)
+#if 12==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 12<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r8,r9                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#14*4]           @ from future BODY_16_xx
+       eor     r12,r8,r9                       @ a^b, b^c in next round
+       ldr     r1,[sp,#11*4]   @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r8,ror#20 @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r11,r11,r7                      @ d+=h
+       eor     r3,r3,r9                        @ Maj(a,b,c)
+       add     r7,r7,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r7,r7,r3                        @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 13
+# if 13==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r11,r11,ror#5
+       add     r7,r7,r3                        @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r11,ror#19        @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 13
+       add     r7,r7,r3                        @ h+=Maj(a,b,c) from the past
+       ldrb    r3,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r3,lsl#8
+       ldrb    r3,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 13==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r11,r11,ror#5
+       orr     r2,r2,r3,lsl#24
+       eor     r0,r0,r11,ror#19        @ Sigma1(e)
+#endif
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r6,r6,r2                        @ h+=X[i]
+       str     r2,[sp,#13*4]
+       eor     r2,r4,r5
+       add     r6,r6,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r11
+       add     r6,r6,r3                        @ h+=K256[i]
+       eor     r2,r2,r5                        @ Ch(e,f,g)
+       eor     r0,r7,r7,ror#11
+       add     r6,r6,r2                        @ h+=Ch(e,f,g)
+#if 13==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 13<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r7,r8                        @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#15*4]           @ from future BODY_16_xx
+       eor     r3,r7,r8                        @ a^b, b^c in next round
+       ldr     r1,[sp,#12*4]   @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r7,ror#20 @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r10,r10,r6                      @ d+=h
+       eor     r12,r12,r8                      @ Maj(a,b,c)
+       add     r6,r6,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r6,r6,r12                       @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 14
+# if 14==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r10,r10,ror#5
+       add     r6,r6,r12                       @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r10,ror#19        @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 14
+       add     r6,r6,r12                       @ h+=Maj(a,b,c) from the past
+       ldrb    r12,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r12,lsl#8
+       ldrb    r12,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 14==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r10,r10,ror#5
+       orr     r2,r2,r12,lsl#24
+       eor     r0,r0,r10,ror#19        @ Sigma1(e)
+#endif
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r5,r5,r2                        @ h+=X[i]
+       str     r2,[sp,#14*4]
+       eor     r2,r11,r4
+       add     r5,r5,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r10
+       add     r5,r5,r12                       @ h+=K256[i]
+       eor     r2,r2,r4                        @ Ch(e,f,g)
+       eor     r0,r6,r6,ror#11
+       add     r5,r5,r2                        @ h+=Ch(e,f,g)
+#if 14==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 14<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r6,r7                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#0*4]            @ from future BODY_16_xx
+       eor     r12,r6,r7                       @ a^b, b^c in next round
+       ldr     r1,[sp,#13*4]   @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r6,ror#20 @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r9,r9,r5                        @ d+=h
+       eor     r3,r3,r7                        @ Maj(a,b,c)
+       add     r5,r5,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r5,r5,r3                        @ h+=Maj(a,b,c)
+#if __ARM_ARCH__>=7
+       @ ldr   r2,[r1],#4                      @ 15
+# if 15==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r9,r9,ror#5
+       add     r5,r5,r3                        @ h+=Maj(a,b,c) from the past
+       eor     r0,r0,r9,ror#19 @ Sigma1(e)
+# ifndef __ARMEB__
+       rev     r2,r2
+# endif
+#else
+       @ ldrb  r2,[r1,#3]                      @ 15
+       add     r5,r5,r3                        @ h+=Maj(a,b,c) from the past
+       ldrb    r3,[r1,#2]
+       ldrb    r0,[r1,#1]
+       orr     r2,r2,r3,lsl#8
+       ldrb    r3,[r1],#4
+       orr     r2,r2,r0,lsl#16
+# if 15==15
+       str     r1,[sp,#17*4]                   @ make room for r1
+# endif
+       eor     r0,r9,r9,ror#5
+       orr     r2,r2,r3,lsl#24
+       eor     r0,r0,r9,ror#19 @ Sigma1(e)
+#endif
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r4,r4,r2                        @ h+=X[i]
+       str     r2,[sp,#15*4]
+       eor     r2,r10,r11
+       add     r4,r4,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r9
+       add     r4,r4,r3                        @ h+=K256[i]
+       eor     r2,r2,r11                       @ Ch(e,f,g)
+       eor     r0,r5,r5,ror#11
+       add     r4,r4,r2                        @ h+=Ch(e,f,g)
+#if 15==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 15<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r5,r6                        @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#1*4]            @ from future BODY_16_xx
+       eor     r3,r5,r6                        @ a^b, b^c in next round
+       ldr     r1,[sp,#14*4]   @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r5,ror#20 @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r8,r8,r4                        @ d+=h
+       eor     r12,r12,r6                      @ Maj(a,b,c)
+       add     r4,r4,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r4,r4,r12                       @ h+=Maj(a,b,c)
+.Lrounds_16_xx:
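+@ Rounds 16..63: each W[t] is rebuilt on the stack as
+@ W[t-16] + sigma0(W[t-15]) + W[t-7] + sigma1(W[t-2]) -- the
+@ ror#7/ror#18/lsr#3 and ror#17/ror#19/lsr#10 chains below -- and then
+@ run through the same round function as rounds 0..15. The low byte of
+@ each K256 constant is compared against 0xf2 (last constant
+@ 0xc67178f2) to detect when all 64 rounds are done.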
+       @ ldr   r2,[sp,#1*4]            @ 16
+       @ ldr   r1,[sp,#14*4]
+       mov     r0,r2,ror#7
+       add     r4,r4,r12                       @ h+=Maj(a,b,c) from the past
+       mov     r12,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r12,r12,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#0*4]
+       eor     r12,r12,r1,lsr#10       @ sigma1(X[i+14])
+       ldr     r1,[sp,#9*4]
+
+       add     r12,r12,r0
+       eor     r0,r8,r8,ror#5  @ from BODY_00_15
+       add     r2,r2,r12
+       eor     r0,r0,r8,ror#19 @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r11,r11,r2                      @ h+=X[i]
+       str     r2,[sp,#0*4]
+       eor     r2,r9,r10
+       add     r11,r11,r0,ror#6        @ h+=Sigma1(e)
+       and     r2,r2,r8
+       add     r11,r11,r12                     @ h+=K256[i]
+       eor     r2,r2,r10                       @ Ch(e,f,g)
+       eor     r0,r4,r4,ror#11
+       add     r11,r11,r2                      @ h+=Ch(e,f,g)
+#if 16==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 16<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r4,r5                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#2*4]            @ from future BODY_16_xx
+       eor     r12,r4,r5                       @ a^b, b^c in next round
+       ldr     r1,[sp,#15*4]   @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r4,ror#20 @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r7,r7,r11                       @ d+=h
+       eor     r3,r3,r5                        @ Maj(a,b,c)
+       add     r11,r11,r0,ror#2        @ h+=Sigma0(a)
+       @ add   r11,r11,r3                      @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#2*4]            @ 17
+       @ ldr   r1,[sp,#15*4]
+       mov     r0,r2,ror#7
+       add     r11,r11,r3                      @ h+=Maj(a,b,c) from the past
+       mov     r3,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r3,r3,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#1*4]
+       eor     r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+       ldr     r1,[sp,#10*4]
+
+       add     r3,r3,r0
+       eor     r0,r7,r7,ror#5  @ from BODY_00_15
+       add     r2,r2,r3
+       eor     r0,r0,r7,ror#19 @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r10,r10,r2                      @ h+=X[i]
+       str     r2,[sp,#1*4]
+       eor     r2,r8,r9
+       add     r10,r10,r0,ror#6        @ h+=Sigma1(e)
+       and     r2,r2,r7
+       add     r10,r10,r3                      @ h+=K256[i]
+       eor     r2,r2,r9                        @ Ch(e,f,g)
+       eor     r0,r11,r11,ror#11
+       add     r10,r10,r2                      @ h+=Ch(e,f,g)
+#if 17==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 17<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r11,r4                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#3*4]            @ from future BODY_16_xx
+       eor     r3,r11,r4                       @ a^b, b^c in next round
+       ldr     r1,[sp,#0*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r11,ror#20        @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r6,r6,r10                       @ d+=h
+       eor     r12,r12,r4                      @ Maj(a,b,c)
+       add     r10,r10,r0,ror#2        @ h+=Sigma0(a)
+       @ add   r10,r10,r12                     @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#3*4]            @ 18
+       @ ldr   r1,[sp,#0*4]
+       mov     r0,r2,ror#7
+       add     r10,r10,r12                     @ h+=Maj(a,b,c) from the past
+       mov     r12,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r12,r12,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#2*4]
+       eor     r12,r12,r1,lsr#10       @ sigma1(X[i+14])
+       ldr     r1,[sp,#11*4]
+
+       add     r12,r12,r0
+       eor     r0,r6,r6,ror#5  @ from BODY_00_15
+       add     r2,r2,r12
+       eor     r0,r0,r6,ror#19 @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r9,r9,r2                        @ h+=X[i]
+       str     r2,[sp,#2*4]
+       eor     r2,r7,r8
+       add     r9,r9,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r6
+       add     r9,r9,r12                       @ h+=K256[i]
+       eor     r2,r2,r8                        @ Ch(e,f,g)
+       eor     r0,r10,r10,ror#11
+       add     r9,r9,r2                        @ h+=Ch(e,f,g)
+#if 18==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 18<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r10,r11                     @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#4*4]            @ from future BODY_16_xx
+       eor     r12,r10,r11                     @ a^b, b^c in next round
+       ldr     r1,[sp,#1*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r10,ror#20        @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r5,r5,r9                        @ d+=h
+       eor     r3,r3,r11                       @ Maj(a,b,c)
+       add     r9,r9,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r9,r9,r3                        @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#4*4]            @ 19
+       @ ldr   r1,[sp,#1*4]
+       mov     r0,r2,ror#7
+       add     r9,r9,r3                        @ h+=Maj(a,b,c) from the past
+       mov     r3,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r3,r3,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#3*4]
+       eor     r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+       ldr     r1,[sp,#12*4]
+
+       add     r3,r3,r0
+       eor     r0,r5,r5,ror#5  @ from BODY_00_15
+       add     r2,r2,r3
+       eor     r0,r0,r5,ror#19 @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r8,r8,r2                        @ h+=X[i]
+       str     r2,[sp,#3*4]
+       eor     r2,r6,r7
+       add     r8,r8,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r5
+       add     r8,r8,r3                        @ h+=K256[i]
+       eor     r2,r2,r7                        @ Ch(e,f,g)
+       eor     r0,r9,r9,ror#11
+       add     r8,r8,r2                        @ h+=Ch(e,f,g)
+#if 19==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 19<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r9,r10                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#5*4]            @ from future BODY_16_xx
+       eor     r3,r9,r10                       @ a^b, b^c in next round
+       ldr     r1,[sp,#2*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r9,ror#20 @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r4,r4,r8                        @ d+=h
+       eor     r12,r12,r10                     @ Maj(a,b,c)
+       add     r8,r8,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r8,r8,r12                       @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#5*4]            @ 20
+       @ ldr   r1,[sp,#2*4]
+       mov     r0,r2,ror#7
+       add     r8,r8,r12                       @ h+=Maj(a,b,c) from the past
+       mov     r12,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r12,r12,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#4*4]
+       eor     r12,r12,r1,lsr#10       @ sigma1(X[i+14])
+       ldr     r1,[sp,#13*4]
+
+       add     r12,r12,r0
+       eor     r0,r4,r4,ror#5  @ from BODY_00_15
+       add     r2,r2,r12
+       eor     r0,r0,r4,ror#19 @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r7,r7,r2                        @ h+=X[i]
+       str     r2,[sp,#4*4]
+       eor     r2,r5,r6
+       add     r7,r7,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r4
+       add     r7,r7,r12                       @ h+=K256[i]
+       eor     r2,r2,r6                        @ Ch(e,f,g)
+       eor     r0,r8,r8,ror#11
+       add     r7,r7,r2                        @ h+=Ch(e,f,g)
+#if 20==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 20<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r8,r9                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#6*4]            @ from future BODY_16_xx
+       eor     r12,r8,r9                       @ a^b, b^c in next round
+       ldr     r1,[sp,#3*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r8,ror#20 @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r11,r11,r7                      @ d+=h
+       eor     r3,r3,r9                        @ Maj(a,b,c)
+       add     r7,r7,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r7,r7,r3                        @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#6*4]            @ 21
+       @ ldr   r1,[sp,#3*4]
+       mov     r0,r2,ror#7
+       add     r7,r7,r3                        @ h+=Maj(a,b,c) from the past
+       mov     r3,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r3,r3,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#5*4]
+       eor     r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+       ldr     r1,[sp,#14*4]
+
+       add     r3,r3,r0
+       eor     r0,r11,r11,ror#5        @ from BODY_00_15
+       add     r2,r2,r3
+       eor     r0,r0,r11,ror#19        @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r6,r6,r2                        @ h+=X[i]
+       str     r2,[sp,#5*4]
+       eor     r2,r4,r5
+       add     r6,r6,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r11
+       add     r6,r6,r3                        @ h+=K256[i]
+       eor     r2,r2,r5                        @ Ch(e,f,g)
+       eor     r0,r7,r7,ror#11
+       add     r6,r6,r2                        @ h+=Ch(e,f,g)
+#if 21==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 21<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r7,r8                        @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#7*4]            @ from future BODY_16_xx
+       eor     r3,r7,r8                        @ a^b, b^c in next round
+       ldr     r1,[sp,#4*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r7,ror#20 @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r10,r10,r6                      @ d+=h
+       eor     r12,r12,r8                      @ Maj(a,b,c)
+       add     r6,r6,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r6,r6,r12                       @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#7*4]            @ 22
+       @ ldr   r1,[sp,#4*4]
+       mov     r0,r2,ror#7
+       add     r6,r6,r12                       @ h+=Maj(a,b,c) from the past
+       mov     r12,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r12,r12,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#6*4]
+       eor     r12,r12,r1,lsr#10       @ sigma1(X[i+14])
+       ldr     r1,[sp,#15*4]
+
+       add     r12,r12,r0
+       eor     r0,r10,r10,ror#5        @ from BODY_00_15
+       add     r2,r2,r12
+       eor     r0,r0,r10,ror#19        @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r5,r5,r2                        @ h+=X[i]
+       str     r2,[sp,#6*4]
+       eor     r2,r11,r4
+       add     r5,r5,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r10
+       add     r5,r5,r12                       @ h+=K256[i]
+       eor     r2,r2,r4                        @ Ch(e,f,g)
+       eor     r0,r6,r6,ror#11
+       add     r5,r5,r2                        @ h+=Ch(e,f,g)
+#if 22==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 22<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r6,r7                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#8*4]            @ from future BODY_16_xx
+       eor     r12,r6,r7                       @ a^b, b^c in next round
+       ldr     r1,[sp,#5*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r6,ror#20 @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r9,r9,r5                        @ d+=h
+       eor     r3,r3,r7                        @ Maj(a,b,c)
+       add     r5,r5,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r5,r5,r3                        @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#8*4]            @ 23
+       @ ldr   r1,[sp,#5*4]
+       mov     r0,r2,ror#7
+       add     r5,r5,r3                        @ h+=Maj(a,b,c) from the past
+       mov     r3,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r3,r3,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#7*4]
+       eor     r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+       ldr     r1,[sp,#0*4]
+
+       add     r3,r3,r0
+       eor     r0,r9,r9,ror#5  @ from BODY_00_15
+       add     r2,r2,r3
+       eor     r0,r0,r9,ror#19 @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r4,r4,r2                        @ h+=X[i]
+       str     r2,[sp,#7*4]
+       eor     r2,r10,r11
+       add     r4,r4,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r9
+       add     r4,r4,r3                        @ h+=K256[i]
+       eor     r2,r2,r11                       @ Ch(e,f,g)
+       eor     r0,r5,r5,ror#11
+       add     r4,r4,r2                        @ h+=Ch(e,f,g)
+#if 23==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 23<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r5,r6                        @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#9*4]            @ from future BODY_16_xx
+       eor     r3,r5,r6                        @ a^b, b^c in next round
+       ldr     r1,[sp,#6*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r5,ror#20 @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r8,r8,r4                        @ d+=h
+       eor     r12,r12,r6                      @ Maj(a,b,c)
+       add     r4,r4,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r4,r4,r12                       @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#9*4]            @ 24
+       @ ldr   r1,[sp,#6*4]
+       mov     r0,r2,ror#7
+       add     r4,r4,r12                       @ h+=Maj(a,b,c) from the past
+       mov     r12,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r12,r12,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#8*4]
+       eor     r12,r12,r1,lsr#10       @ sigma1(X[i+14])
+       ldr     r1,[sp,#1*4]
+
+       add     r12,r12,r0
+       eor     r0,r8,r8,ror#5  @ from BODY_00_15
+       add     r2,r2,r12
+       eor     r0,r0,r8,ror#19 @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r11,r11,r2                      @ h+=X[i]
+       str     r2,[sp,#8*4]
+       eor     r2,r9,r10
+       add     r11,r11,r0,ror#6        @ h+=Sigma1(e)
+       and     r2,r2,r8
+       add     r11,r11,r12                     @ h+=K256[i]
+       eor     r2,r2,r10                       @ Ch(e,f,g)
+       eor     r0,r4,r4,ror#11
+       add     r11,r11,r2                      @ h+=Ch(e,f,g)
+#if 24==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 24<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r4,r5                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#10*4]           @ from future BODY_16_xx
+       eor     r12,r4,r5                       @ a^b, b^c in next round
+       ldr     r1,[sp,#7*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r4,ror#20 @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r7,r7,r11                       @ d+=h
+       eor     r3,r3,r5                        @ Maj(a,b,c)
+       add     r11,r11,r0,ror#2        @ h+=Sigma0(a)
+       @ add   r11,r11,r3                      @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#10*4]           @ 25
+       @ ldr   r1,[sp,#7*4]
+       mov     r0,r2,ror#7
+       add     r11,r11,r3                      @ h+=Maj(a,b,c) from the past
+       mov     r3,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r3,r3,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#9*4]
+       eor     r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+       ldr     r1,[sp,#2*4]
+
+       add     r3,r3,r0
+       eor     r0,r7,r7,ror#5  @ from BODY_00_15
+       add     r2,r2,r3
+       eor     r0,r0,r7,ror#19 @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r10,r10,r2                      @ h+=X[i]
+       str     r2,[sp,#9*4]
+       eor     r2,r8,r9
+       add     r10,r10,r0,ror#6        @ h+=Sigma1(e)
+       and     r2,r2,r7
+       add     r10,r10,r3                      @ h+=K256[i]
+       eor     r2,r2,r9                        @ Ch(e,f,g)
+       eor     r0,r11,r11,ror#11
+       add     r10,r10,r2                      @ h+=Ch(e,f,g)
+#if 25==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 25<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r11,r4                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#11*4]           @ from future BODY_16_xx
+       eor     r3,r11,r4                       @ a^b, b^c in next round
+       ldr     r1,[sp,#8*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r11,ror#20        @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r6,r6,r10                       @ d+=h
+       eor     r12,r12,r4                      @ Maj(a,b,c)
+       add     r10,r10,r0,ror#2        @ h+=Sigma0(a)
+       @ add   r10,r10,r12                     @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#11*4]           @ 26
+       @ ldr   r1,[sp,#8*4]
+       mov     r0,r2,ror#7
+       add     r10,r10,r12                     @ h+=Maj(a,b,c) from the past
+       mov     r12,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r12,r12,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#10*4]
+       eor     r12,r12,r1,lsr#10       @ sigma1(X[i+14])
+       ldr     r1,[sp,#3*4]
+
+       add     r12,r12,r0
+       eor     r0,r6,r6,ror#5  @ from BODY_00_15
+       add     r2,r2,r12
+       eor     r0,r0,r6,ror#19 @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r9,r9,r2                        @ h+=X[i]
+       str     r2,[sp,#10*4]
+       eor     r2,r7,r8
+       add     r9,r9,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r6
+       add     r9,r9,r12                       @ h+=K256[i]
+       eor     r2,r2,r8                        @ Ch(e,f,g)
+       eor     r0,r10,r10,ror#11
+       add     r9,r9,r2                        @ h+=Ch(e,f,g)
+#if 26==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 26<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r10,r11                     @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#12*4]           @ from future BODY_16_xx
+       eor     r12,r10,r11                     @ a^b, b^c in next round
+       ldr     r1,[sp,#9*4]    @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r10,ror#20        @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r5,r5,r9                        @ d+=h
+       eor     r3,r3,r11                       @ Maj(a,b,c)
+       add     r9,r9,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r9,r9,r3                        @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#12*4]           @ 27
+       @ ldr   r1,[sp,#9*4]
+       mov     r0,r2,ror#7
+       add     r9,r9,r3                        @ h+=Maj(a,b,c) from the past
+       mov     r3,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r3,r3,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#11*4]
+       eor     r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+       ldr     r1,[sp,#4*4]
+
+       add     r3,r3,r0
+       eor     r0,r5,r5,ror#5  @ from BODY_00_15
+       add     r2,r2,r3
+       eor     r0,r0,r5,ror#19 @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r8,r8,r2                        @ h+=X[i]
+       str     r2,[sp,#11*4]
+       eor     r2,r6,r7
+       add     r8,r8,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r5
+       add     r8,r8,r3                        @ h+=K256[i]
+       eor     r2,r2,r7                        @ Ch(e,f,g)
+       eor     r0,r9,r9,ror#11
+       add     r8,r8,r2                        @ h+=Ch(e,f,g)
+#if 27==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 27<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r9,r10                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#13*4]           @ from future BODY_16_xx
+       eor     r3,r9,r10                       @ a^b, b^c in next round
+       ldr     r1,[sp,#10*4]   @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r9,ror#20 @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r4,r4,r8                        @ d+=h
+       eor     r12,r12,r10                     @ Maj(a,b,c)
+       add     r8,r8,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r8,r8,r12                       @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#13*4]           @ 28
+       @ ldr   r1,[sp,#10*4]
+       mov     r0,r2,ror#7
+       add     r8,r8,r12                       @ h+=Maj(a,b,c) from the past
+       mov     r12,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r12,r12,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#12*4]
+       eor     r12,r12,r1,lsr#10       @ sigma1(X[i+14])
+       ldr     r1,[sp,#5*4]
+
+       add     r12,r12,r0
+       eor     r0,r4,r4,ror#5  @ from BODY_00_15
+       add     r2,r2,r12
+       eor     r0,r0,r4,ror#19 @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r7,r7,r2                        @ h+=X[i]
+       str     r2,[sp,#12*4]
+       eor     r2,r5,r6
+       add     r7,r7,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r4
+       add     r7,r7,r12                       @ h+=K256[i]
+       eor     r2,r2,r6                        @ Ch(e,f,g)
+       eor     r0,r8,r8,ror#11
+       add     r7,r7,r2                        @ h+=Ch(e,f,g)
+#if 28==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 28<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r8,r9                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#14*4]           @ from future BODY_16_xx
+       eor     r12,r8,r9                       @ a^b, b^c in next round
+       ldr     r1,[sp,#11*4]   @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r8,ror#20 @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r11,r11,r7                      @ d+=h
+       eor     r3,r3,r9                        @ Maj(a,b,c)
+       add     r7,r7,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r7,r7,r3                        @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#14*4]           @ 29
+       @ ldr   r1,[sp,#11*4]
+       mov     r0,r2,ror#7
+       add     r7,r7,r3                        @ h+=Maj(a,b,c) from the past
+       mov     r3,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r3,r3,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#13*4]
+       eor     r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+       ldr     r1,[sp,#6*4]
+
+       add     r3,r3,r0
+       eor     r0,r11,r11,ror#5        @ from BODY_00_15
+       add     r2,r2,r3
+       eor     r0,r0,r11,ror#19        @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r6,r6,r2                        @ h+=X[i]
+       str     r2,[sp,#13*4]
+       eor     r2,r4,r5
+       add     r6,r6,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r11
+       add     r6,r6,r3                        @ h+=K256[i]
+       eor     r2,r2,r5                        @ Ch(e,f,g)
+       eor     r0,r7,r7,ror#11
+       add     r6,r6,r2                        @ h+=Ch(e,f,g)
+#if 29==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 29<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r7,r8                        @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#15*4]           @ from future BODY_16_xx
+       eor     r3,r7,r8                        @ a^b, b^c in next round
+       ldr     r1,[sp,#12*4]   @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r7,ror#20 @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r10,r10,r6                      @ d+=h
+       eor     r12,r12,r8                      @ Maj(a,b,c)
+       add     r6,r6,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r6,r6,r12                       @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#15*4]           @ 30
+       @ ldr   r1,[sp,#12*4]
+       mov     r0,r2,ror#7
+       add     r6,r6,r12                       @ h+=Maj(a,b,c) from the past
+       mov     r12,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r12,r12,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#14*4]
+       eor     r12,r12,r1,lsr#10       @ sigma1(X[i+14])
+       ldr     r1,[sp,#7*4]
+
+       add     r12,r12,r0
+       eor     r0,r10,r10,ror#5        @ from BODY_00_15
+       add     r2,r2,r12
+       eor     r0,r0,r10,ror#19        @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r12,[r14],#4                    @ *K256++
+       add     r5,r5,r2                        @ h+=X[i]
+       str     r2,[sp,#14*4]
+       eor     r2,r11,r4
+       add     r5,r5,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r10
+       add     r5,r5,r12                       @ h+=K256[i]
+       eor     r2,r2,r4                        @ Ch(e,f,g)
+       eor     r0,r6,r6,ror#11
+       add     r5,r5,r2                        @ h+=Ch(e,f,g)
+#if 30==31
+       and     r12,r12,#0xff
+       cmp     r12,#0xf2                       @ done?
+#endif
+#if 30<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r12,r6,r7                       @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#0*4]            @ from future BODY_16_xx
+       eor     r12,r6,r7                       @ a^b, b^c in next round
+       ldr     r1,[sp,#13*4]   @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r6,ror#20 @ Sigma0(a)
+       and     r3,r3,r12                       @ (b^c)&=(a^b)
+       add     r9,r9,r5                        @ d+=h
+       eor     r3,r3,r7                        @ Maj(a,b,c)
+       add     r5,r5,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r5,r5,r3                        @ h+=Maj(a,b,c)
+       @ ldr   r2,[sp,#0*4]            @ 31
+       @ ldr   r1,[sp,#13*4]
+       mov     r0,r2,ror#7
+       add     r5,r5,r3                        @ h+=Maj(a,b,c) from the past
+       mov     r3,r1,ror#17
+       eor     r0,r0,r2,ror#18
+       eor     r3,r3,r1,ror#19
+       eor     r0,r0,r2,lsr#3  @ sigma0(X[i+1])
+       ldr     r2,[sp,#15*4]
+       eor     r3,r3,r1,lsr#10 @ sigma1(X[i+14])
+       ldr     r1,[sp,#8*4]
+
+       add     r3,r3,r0
+       eor     r0,r9,r9,ror#5  @ from BODY_00_15
+       add     r2,r2,r3
+       eor     r0,r0,r9,ror#19 @ Sigma1(e)
+       add     r2,r2,r1                        @ X[i]
+       ldr     r3,[r14],#4                     @ *K256++
+       add     r4,r4,r2                        @ h+=X[i]
+       str     r2,[sp,#15*4]
+       eor     r2,r10,r11
+       add     r4,r4,r0,ror#6  @ h+=Sigma1(e)
+       and     r2,r2,r9
+       add     r4,r4,r3                        @ h+=K256[i]
+       eor     r2,r2,r11                       @ Ch(e,f,g)
+       eor     r0,r5,r5,ror#11
+       add     r4,r4,r2                        @ h+=Ch(e,f,g)
+#if 31==31
+       and     r3,r3,#0xff
+       cmp     r3,#0xf2                        @ done?
+#endif
+#if 31<15
+# if __ARM_ARCH__>=7
+       ldr     r2,[r1],#4                      @ prefetch
+# else
+       ldrb    r2,[r1,#3]
+# endif
+       eor     r3,r5,r6                        @ a^b, b^c in next round
+#else
+       ldr     r2,[sp,#1*4]            @ from future BODY_16_xx
+       eor     r3,r5,r6                        @ a^b, b^c in next round
+       ldr     r1,[sp,#14*4]   @ from future BODY_16_xx
+#endif
+       eor     r0,r0,r5,ror#20 @ Sigma0(a)
+       and     r12,r12,r3                      @ (b^c)&=(a^b)
+       add     r8,r8,r4                        @ d+=h
+       eor     r12,r12,r6                      @ Maj(a,b,c)
+       add     r4,r4,r0,ror#2  @ h+=Sigma0(a)
+       @ add   r4,r4,r12                       @ h+=Maj(a,b,c)
+#ifdef __thumb2__
+       ite     eq                      @ Thumb2 thing, sanity check in ARM
+#endif
+       ldreq   r3,[sp,#16*4]           @ pull ctx
+       bne     .Lrounds_16_xx
+
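+@ End of the 64-round block: add the working variables (r4-r11) back
+@ into the hash state pointed to by r3, reload inp and the
+@ end-of-input pointer from the frame, rewind the K256 pointer by 256
+@ bytes and branch back to .Loop for the next block; once the input is
+@ exhausted, tear down the frame and return.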
+       add     r4,r4,r12               @ h+=Maj(a,b,c) from the past
+       ldr     r0,[r3,#0]
+       ldr     r2,[r3,#4]
+       ldr     r12,[r3,#8]
+       add     r4,r4,r0
+       ldr     r0,[r3,#12]
+       add     r5,r5,r2
+       ldr     r2,[r3,#16]
+       add     r6,r6,r12
+       ldr     r12,[r3,#20]
+       add     r7,r7,r0
+       ldr     r0,[r3,#24]
+       add     r8,r8,r2
+       ldr     r2,[r3,#28]
+       add     r9,r9,r12
+       ldr     r1,[sp,#17*4]           @ pull inp
+       ldr     r12,[sp,#18*4]          @ pull inp+len
+       add     r10,r10,r0
+       add     r11,r11,r2
+       stmia   r3,{r4,r5,r6,r7,r8,r9,r10,r11}
+       cmp     r1,r12
+       sub     r14,r14,#256    @ rewind Ktbl
+       bne     .Loop
+
+       add     sp,sp,#19*4     @ destroy frame
+#if __ARM_ARCH__>=5
+       ldmia   sp!,{r4-r11,pc}
+#else
+       ldmia   sp!,{r4-r11,lr}
+       tst     lr,#1
+       moveq   pc,lr                   @ be binary compatible with V4, yet
+       .word   0xe12fff1e                      @ interoperable with Thumb ISA:-)
+#endif
+.size  zfs_sha256_block_armv7,.-zfs_sha256_block_armv7
+
+.arch  armv7-a
+.fpu   neon
+
+.globl zfs_sha256_block_neon
+.type  zfs_sha256_block_neon,%function
+.align 5
+.skip  16
+zfs_sha256_block_neon:
+.LNEON:
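+@ NEON code path: an aligned scratch area is carved out below sp, the
+@ 64-byte block is loaded into q0-q3 and byte-reversed with vrev32.8,
+@ and the first 16 round constants are pre-added and staged on the
+@ stack. The eight state words stay in r4-r11; the scalar rounds are
+@ interleaved with the NEON message-schedule expansion in .L_00_48.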
+       stmdb   sp!,{r4-r12,lr}
+
+       sub     r11,sp,#16*4+16
+       adr     r14,K256
+       bic     r11,r11,#15             @ align for 128-bit stores
+       mov     r12,sp
+       mov     sp,r11                  @ alloca
+       add     r2,r1,r2,lsl#6  @ len to point at the end of inp
+
+       vld1.8          {q0},[r1]!
+       vld1.8          {q1},[r1]!
+       vld1.8          {q2},[r1]!
+       vld1.8          {q3},[r1]!
+       vld1.32         {q8},[r14,:128]!
+       vld1.32         {q9},[r14,:128]!
+       vld1.32         {q10},[r14,:128]!
+       vld1.32         {q11},[r14,:128]!
+       vrev32.8        q0,q0           @ yes, even on
+       str             r0,[sp,#64]
+       vrev32.8        q1,q1           @ big-endian
+       str             r1,[sp,#68]
+       mov             r1,sp
+       vrev32.8        q2,q2
+       str             r2,[sp,#72]
+       vrev32.8        q3,q3
+       str             r12,[sp,#76]            @ save original sp
+       vadd.i32        q8,q8,q0
+       vadd.i32        q9,q9,q1
+       vst1.32         {q8},[r1,:128]!
+       vadd.i32        q10,q10,q2
+       vst1.32         {q9},[r1,:128]!
+       vadd.i32        q11,q11,q3
+       vst1.32         {q10},[r1,:128]!
+       vst1.32         {q11},[r1,:128]!
+
+       ldmia           r0,{r4-r11}
+       sub             r1,r1,#64
+       ldr             r2,[sp,#0]
+       eor             r12,r12,r12
+       eor             r3,r5,r6
+       b               .L_00_48
+
+.align 4
+.L_00_48:
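+@ Each pass expands four message words at a time: vext/vshr/vsli/veor
+@ compute sigma0 and sigma1, the next K256 constants are loaded from
+@ r14 (vld1.32) and added, and W+K is written back to the stack
+@ (vst1.32) for the interleaved scalar rounds.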
+       vext.8  q8,q0,q1,#4
+       add     r11,r11,r2
+       eor     r2,r9,r10
+       eor     r0,r8,r8,ror#5
+       vext.8  q9,q2,q3,#4
+       add     r4,r4,r12
+       and     r2,r2,r8
+       eor     r12,r0,r8,ror#19
+       vshr.u32        q10,q8,#7
+       eor     r0,r4,r4,ror#11
+       eor     r2,r2,r10
+       vadd.i32        q0,q0,q9
+       add     r11,r11,r12,ror#6
+       eor     r12,r4,r5
+       vshr.u32        q9,q8,#3
+       eor     r0,r0,r4,ror#20
+       add     r11,r11,r2
+       vsli.32 q10,q8,#25
+       ldr     r2,[sp,#4]
+       and     r3,r3,r12
+       vshr.u32        q11,q8,#18
+       add     r7,r7,r11
+       add     r11,r11,r0,ror#2
+       eor     r3,r3,r5
+       veor    q9,q9,q10
+       add     r10,r10,r2
+       vsli.32 q11,q8,#14
+       eor     r2,r8,r9
+       eor     r0,r7,r7,ror#5
+       vshr.u32        d24,d7,#17
+       add     r11,r11,r3
+       and     r2,r2,r7
+       veor    q9,q9,q11
+       eor     r3,r0,r7,ror#19
+       eor     r0,r11,r11,ror#11
+       vsli.32 d24,d7,#15
+       eor     r2,r2,r9
+       add     r10,r10,r3,ror#6
+       vshr.u32        d25,d7,#10
+       eor     r3,r11,r4
+       eor     r0,r0,r11,ror#20
+       vadd.i32        q0,q0,q9
+       add     r10,r10,r2
+       ldr     r2,[sp,#8]
+       veor    d25,d25,d24
+       and     r12,r12,r3
+       add     r6,r6,r10
+       vshr.u32        d24,d7,#19
+       add     r10,r10,r0,ror#2
+       eor     r12,r12,r4
+       vsli.32 d24,d7,#13
+       add     r9,r9,r2
+       eor     r2,r7,r8
+       veor    d25,d25,d24
+       eor     r0,r6,r6,ror#5
+       add     r10,r10,r12
+       vadd.i32        d0,d0,d25
+       and     r2,r2,r6
+       eor     r12,r0,r6,ror#19
+       vshr.u32        d24,d0,#17
+       eor     r0,r10,r10,ror#11
+       eor     r2,r2,r8
+       vsli.32 d24,d0,#15
+       add     r9,r9,r12,ror#6
+       eor     r12,r10,r11
+       vshr.u32        d25,d0,#10
+       eor     r0,r0,r10,ror#20
+       add     r9,r9,r2
+       veor    d25,d25,d24
+       ldr     r2,[sp,#12]
+       and     r3,r3,r12
+       vshr.u32        d24,d0,#19
+       add     r5,r5,r9
+       add     r9,r9,r0,ror#2
+       eor     r3,r3,r11
+       vld1.32 {q8},[r14,:128]!
+       add     r8,r8,r2
+       vsli.32 d24,d0,#13
+       eor     r2,r6,r7
+       eor     r0,r5,r5,ror#5
+       veor    d25,d25,d24
+       add     r9,r9,r3
+       and     r2,r2,r5
+       vadd.i32        d1,d1,d25
+       eor     r3,r0,r5,ror#19
+       eor     r0,r9,r9,ror#11
+       vadd.i32        q8,q8,q0
+       eor     r2,r2,r7
+       add     r8,r8,r3,ror#6
+       eor     r3,r9,r10
+       eor     r0,r0,r9,ror#20
+       add     r8,r8,r2
+       ldr     r2,[sp,#16]
+       and     r12,r12,r3
+       add     r4,r4,r8
+       vst1.32 {q8},[r1,:128]!
+       add     r8,r8,r0,ror#2
+       eor     r12,r12,r10
+       vext.8  q8,q1,q2,#4
+       add     r7,r7,r2
+       eor     r2,r5,r6
+       eor     r0,r4,r4,ror#5
+       vext.8  q9,q3,q0,#4
+       add     r8,r8,r12
+       and     r2,r2,r4
+       eor     r12,r0,r4,ror#19
+       vshr.u32        q10,q8,#7
+       eor     r0,r8,r8,ror#11
+       eor     r2,r2,r6
+       vadd.i32        q1,q1,q9
+       add     r7,r7,r12,ror#6
+       eor     r12,r8,r9
+       vshr.u32        q9,q8,#3
+       eor     r0,r0,r8,ror#20
+       add     r7,r7,r2
+       vsli.32 q10,q8,#25
+       ldr     r2,[sp,#20]
+       and     r3,r3,r12
+       vshr.u32        q11,q8,#18
+       add     r11,r11,r7
+       add     r7,r7,r0,ror#2
+       eor     r3,r3,r9
+       veor    q9,q9,q10
+       add     r6,r6,r2
+       vsli.32 q11,q8,#14
+       eor     r2,r4,r5
+       eor     r0,r11,r11,ror#5
+       vshr.u32        d24,d1,#17
+       add     r7,r7,r3
+       and     r2,r2,r11
+       veor    q9,q9,q11
+       eor     r3,r0,r11,ror#19
+       eor     r0,r7,r7,ror#11
+       vsli.32 d24,d1,#15
+       eor     r2,r2,r5
+       add     r6,r6,r3,ror#6
+       vshr.u32        d25,d1,#10
+       eor     r3,r7,r8
+       eor     r0,r0,r7,ror#20
+       vadd.i32        q1,q1,q9
+       add     r6,r6,r2
+       ldr     r2,[sp,#24]
+       veor    d25,d25,d24
+       and     r12,r12,r3
+       add     r10,r10,r6
+       vshr.u32        d24,d1,#19
+       add     r6,r6,r0,ror#2
+       eor     r12,r12,r8
+       vsli.32 d24,d1,#13
+       add     r5,r5,r2
+       eor     r2,r11,r4
+       veor    d25,d25,d24
+       eor     r0,r10,r10,ror#5
+       add     r6,r6,r12
+       vadd.i32        d2,d2,d25
+       and     r2,r2,r10
+       eor     r12,r0,r10,ror#19
+       vshr.u32        d24,d2,#17
+       eor     r0,r6,r6,ror#11
+       eor     r2,r2,r4
+       vsli.32 d24,d2,#15
+       add     r5,r5,r12,ror#6
+       eor     r12,r6,r7
+       vshr.u32        d25,d2,#10
+       eor     r0,r0,r6,ror#20
+       add     r5,r5,r2
+       veor    d25,d25,d24
+       ldr     r2,[sp,#28]
+       and     r3,r3,r12
+       vshr.u32        d24,d2,#19
+       add     r9,r9,r5
+       add     r5,r5,r0,ror#2
+       eor     r3,r3,r7
+       vld1.32 {q8},[r14,:128]!
+       add     r4,r4,r2
+       vsli.32 d24,d2,#13
+       eor     r2,r10,r11
+       eor     r0,r9,r9,ror#5
+       veor    d25,d25,d24
+       add     r5,r5,r3
+       and     r2,r2,r9
+       vadd.i32        d3,d3,d25
+       eor     r3,r0,r9,ror#19
+       eor     r0,r5,r5,ror#11
+       vadd.i32        q8,q8,q1
+       eor     r2,r2,r11
+       add     r4,r4,r3,ror#6
+       eor     r3,r5,r6
+       eor     r0,r0,r5,ror#20
+       add     r4,r4,r2
+       ldr     r2,[sp,#32]
+       and     r12,r12,r3
+       add     r8,r8,r4
+       vst1.32 {q8},[r1,:128]!
+       add     r4,r4,r0,ror#2
+       eor     r12,r12,r6
+       vext.8  q8,q2,q3,#4
+       add     r11,r11,r2
+       eor     r2,r9,r10
+       eor     r0,r8,r8,ror#5
+       vext.8  q9,q0,q1,#4
+       add     r4,r4,r12
+       and     r2,r2,r8
+       eor     r12,r0,r8,ror#19
+       vshr.u32        q10,q8,#7
+       eor     r0,r4,r4,ror#11
+       eor     r2,r2,r10
+       vadd.i32        q2,q2,q9
+       add     r11,r11,r12,ror#6
+       eor     r12,r4,r5
+       vshr.u32        q9,q8,#3
+       eor     r0,r0,r4,ror#20
+       add     r11,r11,r2
+       vsli.32 q10,q8,#25
+       ldr     r2,[sp,#36]
+       and     r3,r3,r12
+       vshr.u32        q11,q8,#18
+       add     r7,r7,r11
+       add     r11,r11,r0,ror#2
+       eor     r3,r3,r5
+       veor    q9,q9,q10
+       add     r10,r10,r2
+       vsli.32 q11,q8,#14
+       eor     r2,r8,r9
+       eor     r0,r7,r7,ror#5
+       vshr.u32        d24,d3,#17
+       add     r11,r11,r3
+       and     r2,r2,r7
+       veor    q9,q9,q11
+       eor     r3,r0,r7,ror#19
+       eor     r0,r11,r11,ror#11
+       vsli.32 d24,d3,#15
+       eor     r2,r2,r9
+       add     r10,r10,r3,ror#6
+       vshr.u32        d25,d3,#10
+       eor     r3,r11,r4
+       eor     r0,r0,r11,ror#20
+       vadd.i32        q2,q2,q9
+       add     r10,r10,r2
+       ldr     r2,[sp,#40]
+       veor    d25,d25,d24
+       and     r12,r12,r3
+       add     r6,r6,r10
+       vshr.u32        d24,d3,#19
+       add     r10,r10,r0,ror#2
+       eor     r12,r12,r4
+       vsli.32 d24,d3,#13
+       add     r9,r9,r2
+       eor     r2,r7,r8
+       veor    d25,d25,d24
+       eor     r0,r6,r6,ror#5
+       add     r10,r10,r12
+       vadd.i32        d4,d4,d25
+       and     r2,r2,r6
+       eor     r12,r0,r6,ror#19
+       vshr.u32        d24,d4,#17
+       eor     r0,r10,r10,ror#11
+       eor     r2,r2,r8
+       vsli.32 d24,d4,#15
+       add     r9,r9,r12,ror#6
+       eor     r12,r10,r11
+       vshr.u32        d25,d4,#10
+       eor     r0,r0,r10,ror#20
+       add     r9,r9,r2
+       veor    d25,d25,d24
+       ldr     r2,[sp,#44]
+       and     r3,r3,r12
+       vshr.u32        d24,d4,#19
+       add     r5,r5,r9
+       add     r9,r9,r0,ror#2
+       eor     r3,r3,r11
+       vld1.32 {q8},[r14,:128]!
+       add     r8,r8,r2
+       vsli.32 d24,d4,#13
+       eor     r2,r6,r7
+       eor     r0,r5,r5,ror#5
+       veor    d25,d25,d24
+       add     r9,r9,r3
+       and     r2,r2,r5
+       vadd.i32        d5,d5,d25
+       eor     r3,r0,r5,ror#19
+       eor     r0,r9,r9,ror#11
+       vadd.i32        q8,q8,q2
+       eor     r2,r2,r7
+       add     r8,r8,r3,ror#6
+       eor     r3,r9,r10
+       eor     r0,r0,r9,ror#20
+       add     r8,r8,r2
+       ldr     r2,[sp,#48]
+       and     r12,r12,r3
+       add     r4,r4,r8
+       vst1.32 {q8},[r1,:128]!
+       add     r8,r8,r0,ror#2
+       eor     r12,r12,r10
+       vext.8  q8,q3,q0,#4
+       add     r7,r7,r2
+       eor     r2,r5,r6
+       eor     r0,r4,r4,ror#5
+       vext.8  q9,q1,q2,#4
+       add     r8,r8,r12
+       and     r2,r2,r4
+       eor     r12,r0,r4,ror#19
+       vshr.u32        q10,q8,#7
+       eor     r0,r8,r8,ror#11
+       eor     r2,r2,r6
+       vadd.i32        q3,q3,q9
+       add     r7,r7,r12,ror#6
+       eor     r12,r8,r9
+       vshr.u32        q9,q8,#3
+       eor     r0,r0,r8,ror#20
+       add     r7,r7,r2
+       vsli.32 q10,q8,#25
+       ldr     r2,[sp,#52]
+       and     r3,r3,r12
+       vshr.u32        q11,q8,#18
+       add     r11,r11,r7
+       add     r7,r7,r0,ror#2
+       eor     r3,r3,r9
+       veor    q9,q9,q10
+       add     r6,r6,r2
+       vsli.32 q11,q8,#14
+       eor     r2,r4,r5
+       eor     r0,r11,r11,ror#5
+       vshr.u32        d24,d5,#17
+       add     r7,r7,r3
+       and     r2,r2,r11
+       veor    q9,q9,q11
+       eor     r3,r0,r11,ror#19
+       eor     r0,r7,r7,ror#11
+       vsli.32 d24,d5,#15
+       eor     r2,r2,r5
+       add     r6,r6,r3,ror#6
+       vshr.u32        d25,d5,#10
+       eor     r3,r7,r8
+       eor     r0,r0,r7,ror#20
+       vadd.i32        q3,q3,q9
+       add     r6,r6,r2
+       ldr     r2,[sp,#56]
+       veor    d25,d25,d24
+       and     r12,r12,r3
+       add     r10,r10,r6
+       vshr.u32        d24,d5,#19
+       add     r6,r6,r0,ror#2
+       eor     r12,r12,r8
+       vsli.32 d24,d5,#13
+       add     r5,r5,r2
+       eor     r2,r11,r4
+       veor    d25,d25,d24
+       eor     r0,r10,r10,ror#5
+       add     r6,r6,r12
+       vadd.i32        d6,d6,d25
+       and     r2,r2,r10
+       eor     r12,r0,r10,ror#19
+       vshr.u32        d24,d6,#17
+       eor     r0,r6,r6,ror#11
+       eor     r2,r2,r4
+       vsli.32 d24,d6,#15
+       add     r5,r5,r12,ror#6
+       eor     r12,r6,r7
+       vshr.u32        d25,d6,#10
+       eor     r0,r0,r6,ror#20
+       add     r5,r5,r2
+       veor    d25,d25,d24
+       ldr     r2,[sp,#60]
+       and     r3,r3,r12
+       vshr.u32        d24,d6,#19
+       add     r9,r9,r5
+       add     r5,r5,r0,ror#2
+       eor     r3,r3,r7
+       vld1.32 {q8},[r14,:128]!
+       add     r4,r4,r2
+       vsli.32 d24,d6,#13
+       eor     r2,r10,r11
+       eor     r0,r9,r9,ror#5
+       veor    d25,d25,d24
+       add     r5,r5,r3
+       and     r2,r2,r9
+       vadd.i32        d7,d7,d25
+       eor     r3,r0,r9,ror#19
+       eor     r0,r5,r5,ror#11
+       vadd.i32        q8,q8,q3
+       eor     r2,r2,r11
+       add     r4,r4,r3,ror#6
+       eor     r3,r5,r6
+       eor     r0,r0,r5,ror#20
+       add     r4,r4,r2
+       ldr     r2,[r14]
+       and     r12,r12,r3
+       add     r8,r8,r4
+       vst1.32 {q8},[r1,:128]!
+       add     r4,r4,r0,ror#2
+       eor     r12,r12,r6
+       teq     r2,#0                           @ check for K256 terminator
+       ldr     r2,[sp,#0]
+       sub     r1,r1,#64
+       bne     .L_00_48
+
+       ldr             r1,[sp,#68]
+       ldr             r0,[sp,#72]
+       sub             r14,r14,#256    @ rewind r14
+       teq             r1,r0
+       it              eq
+       subeq           r1,r1,#64               @ avoid SEGV
+       vld1.8          {q0},[r1]!              @ load next input block
+       vld1.8          {q1},[r1]!
+       vld1.8          {q2},[r1]!
+       vld1.8          {q3},[r1]!
+       it              ne
+       strne           r1,[sp,#68]
+       mov             r1,sp
+       add     r11,r11,r2
+       eor     r2,r9,r10
+       eor     r0,r8,r8,ror#5
+       add     r4,r4,r12
+       vld1.32 {q8},[r14,:128]!
+       and     r2,r2,r8
+       eor     r12,r0,r8,ror#19
+       eor     r0,r4,r4,ror#11
+       eor     r2,r2,r10
+       vrev32.8        q0,q0
+       add     r11,r11,r12,ror#6
+       eor     r12,r4,r5
+       eor     r0,r0,r4,ror#20
+       add     r11,r11,r2
+       vadd.i32        q8,q8,q0
+       ldr     r2,[sp,#4]
+       and     r3,r3,r12
+       add     r7,r7,r11
+       add     r11,r11,r0,ror#2
+       eor     r3,r3,r5
+       add     r10,r10,r2
+       eor     r2,r8,r9
+       eor     r0,r7,r7,ror#5
+       add     r11,r11,r3
+       and     r2,r2,r7
+       eor     r3,r0,r7,ror#19
+       eor     r0,r11,r11,ror#11
+       eor     r2,r2,r9
+       add     r10,r10,r3,ror#6
+       eor     r3,r11,r4
+       eor     r0,r0,r11,ror#20
+       add     r10,r10,r2
+       ldr     r2,[sp,#8]
+       and     r12,r12,r3
+       add     r6,r6,r10
+       add     r10,r10,r0,ror#2
+       eor     r12,r12,r4
+       add     r9,r9,r2
+       eor     r2,r7,r8
+       eor     r0,r6,r6,ror#5
+       add     r10,r10,r12
+       and     r2,r2,r6
+       eor     r12,r0,r6,ror#19
+       eor     r0,r10,r10,ror#11
+       eor     r2,r2,r8
+       add     r9,r9,r12,ror#6
+       eor     r12,r10,r11
+       eor     r0,r0,r10,ror#20
+       add     r9,r9,r2
+       ldr     r2,[sp,#12]
+       and     r3,r3,r12
+       add     r5,r5,r9
+       add     r9,r9,r0,ror#2
+       eor     r3,r3,r11
+       add     r8,r8,r2
+       eor     r2,r6,r7
+       eor     r0,r5,r5,ror#5
+       add     r9,r9,r3
+       and     r2,r2,r5
+       eor     r3,r0,r5,ror#19
+       eor     r0,r9,r9,ror#11
+       eor     r2,r2,r7
+       add     r8,r8,r3,ror#6
+       eor     r3,r9,r10
+       eor     r0,r0,r9,ror#20
+       add     r8,r8,r2
+       ldr     r2,[sp,#16]
+       and     r12,r12,r3
+       add     r4,r4,r8
+       add     r8,r8,r0,ror#2
+       eor     r12,r12,r10
+       vst1.32 {q8},[r1,:128]!
+       add     r7,r7,r2
+       eor     r2,r5,r6
+       eor     r0,r4,r4,ror#5
+       add     r8,r8,r12
+       vld1.32 {q8},[r14,:128]!
+       and     r2,r2,r4
+       eor     r12,r0,r4,ror#19
+       eor     r0,r8,r8,ror#11
+       eor     r2,r2,r6
+       vrev32.8        q1,q1
+       add     r7,r7,r12,ror#6
+       eor     r12,r8,r9
+       eor     r0,r0,r8,ror#20
+       add     r7,r7,r2
+       vadd.i32        q8,q8,q1
+       ldr     r2,[sp,#20]
+       and     r3,r3,r12
+       add     r11,r11,r7
+       add     r7,r7,r0,ror#2
+       eor     r3,r3,r9
+       add     r6,r6,r2
+       eor     r2,r4,r5
+       eor     r0,r11,r11,ror#5
+       add     r7,r7,r3
+       and     r2,r2,r11
+       eor     r3,r0,r11,ror#19
+       eor     r0,r7,r7,ror#11
+       eor     r2,r2,r5
+       add     r6,r6,r3,ror#6
+       eor     r3,r7,r8
+       eor     r0,r0,r7,ror#20
+       add     r6,r6,r2
+       ldr     r2,[sp,#24]
+       and     r12,r12,r3
+       add     r10,r10,r6
+       add     r6,r6,r0,ror#2
+       eor     r12,r12,r8
+       add     r5,r5,r2
+       eor     r2,r11,r4
+       eor     r0,r10,r10,ror#5
+       add     r6,r6,r12
+       and     r2,r2,r10
+       eor     r12,r0,r10,ror#19
+       eor     r0,r6,r6,ror#11
+       eor     r2,r2,r4
+       add     r5,r5,r12,ror#6
+       eor     r12,r6,r7
+       eor     r0,r0,r6,ror#20
+       add     r5,r5,r2
+       ldr     r2,[sp,#28]
+       and     r3,r3,r12
+       add     r9,r9,r5
+       add     r5,r5,r0,ror#2
+       eor     r3,r3,r7
+       add     r4,r4,r2
+       eor     r2,r10,r11
+       eor     r0,r9,r9,ror#5
+       add     r5,r5,r3
+       and     r2,r2,r9
+       eor     r3,r0,r9,ror#19
+       eor     r0,r5,r5,ror#11
+       eor     r2,r2,r11
+       add     r4,r4,r3,ror#6
+       eor     r3,r5,r6
+       eor     r0,r0,r5,ror#20
+       add     r4,r4,r2
+       ldr     r2,[sp,#32]
+       and     r12,r12,r3
+       add     r8,r8,r4
+       add     r4,r4,r0,ror#2
+       eor     r12,r12,r6
+       vst1.32 {q8},[r1,:128]!
+       add     r11,r11,r2
+       eor     r2,r9,r10
+       eor     r0,r8,r8,ror#5
+       add     r4,r4,r12
+       vld1.32 {q8},[r14,:128]!
+       and     r2,r2,r8
+       eor     r12,r0,r8,ror#19
+       eor     r0,r4,r4,ror#11
+       eor     r2,r2,r10
+       vrev32.8        q2,q2
+       add     r11,r11,r12,ror#6
+       eor     r12,r4,r5
+       eor     r0,r0,r4,ror#20
+       add     r11,r11,r2
+       vadd.i32        q8,q8,q2
+       ldr     r2,[sp,#36]
+       and     r3,r3,r12
+       add     r7,r7,r11
+       add     r11,r11,r0,ror#2
+       eor     r3,r3,r5
+       add     r10,r10,r2
+       eor     r2,r8,r9
+       eor     r0,r7,r7,ror#5
+       add     r11,r11,r3
+       and     r2,r2,r7
+       eor     r3,r0,r7,ror#19
+       eor     r0,r11,r11,ror#11
+       eor     r2,r2,r9
+       add     r10,r10,r3,ror#6
+       eor     r3,r11,r4
+       eor     r0,r0,r11,ror#20
+       add     r10,r10,r2
+       ldr     r2,[sp,#40]
+       and     r12,r12,r3
+       add     r6,r6,r10
+       add     r10,r10,r0,ror#2
+       eor     r12,r12,r4
+       add     r9,r9,r2
+       eor     r2,r7,r8
+       eor     r0,r6,r6,ror#5
+       add     r10,r10,r12
+       and     r2,r2,r6
+       eor     r12,r0,r6,ror#19
+       eor     r0,r10,r10,ror#11
+       eor     r2,r2,r8
+       add     r9,r9,r12,ror#6
+       eor     r12,r10,r11
+       eor     r0,r0,r10,ror#20
+       add     r9,r9,r2
+       ldr     r2,[sp,#44]
+       and     r3,r3,r12
+       add     r5,r5,r9
+       add     r9,r9,r0,ror#2
+       eor     r3,r3,r11
+       add     r8,r8,r2
+       eor     r2,r6,r7
+       eor     r0,r5,r5,ror#5
+       add     r9,r9,r3
+       and     r2,r2,r5
+       eor     r3,r0,r5,ror#19
+       eor     r0,r9,r9,ror#11
+       eor     r2,r2,r7
+       add     r8,r8,r3,ror#6
+       eor     r3,r9,r10
+       eor     r0,r0,r9,ror#20
+       add     r8,r8,r2
+       ldr     r2,[sp,#48]
+       and     r12,r12,r3
+       add     r4,r4,r8
+       add     r8,r8,r0,ror#2
+       eor     r12,r12,r10
+       vst1.32 {q8},[r1,:128]!
+       add     r7,r7,r2
+       eor     r2,r5,r6
+       eor     r0,r4,r4,ror#5
+       add     r8,r8,r12
+       vld1.32 {q8},[r14,:128]!
+       and     r2,r2,r4
+       eor     r12,r0,r4,ror#19
+       eor     r0,r8,r8,ror#11
+       eor     r2,r2,r6
+       vrev32.8        q3,q3
+       add     r7,r7,r12,ror#6
+       eor     r12,r8,r9
+       eor     r0,r0,r8,ror#20
+       add     r7,r7,r2
+       vadd.i32        q8,q8,q3
+       ldr     r2,[sp,#52]
+       and     r3,r3,r12
+       add     r11,r11,r7
+       add     r7,r7,r0,ror#2
+       eor     r3,r3,r9
+       add     r6,r6,r2
+       eor     r2,r4,r5
+       eor     r0,r11,r11,ror#5
+       add     r7,r7,r3
+       and     r2,r2,r11
+       eor     r3,r0,r11,ror#19
+       eor     r0,r7,r7,ror#11
+       eor     r2,r2,r5
+       add     r6,r6,r3,ror#6
+       eor     r3,r7,r8
+       eor     r0,r0,r7,ror#20
+       add     r6,r6,r2
+       ldr     r2,[sp,#56]
+       and     r12,r12,r3
+       add     r10,r10,r6
+       add     r6,r6,r0,ror#2
+       eor     r12,r12,r8
+       add     r5,r5,r2
+       eor     r2,r11,r4
+       eor     r0,r10,r10,ror#5
+       add     r6,r6,r12
+       and     r2,r2,r10
+       eor     r12,r0,r10,ror#19
+       eor     r0,r6,r6,ror#11
+       eor     r2,r2,r4
+       add     r5,r5,r12,ror#6
+       eor     r12,r6,r7
+       eor     r0,r0,r6,ror#20
+       add     r5,r5,r2
+       ldr     r2,[sp,#60]
+       and     r3,r3,r12
+       add     r9,r9,r5
+       add     r5,r5,r0,ror#2
+       eor     r3,r3,r7
+       add     r4,r4,r2
+       eor     r2,r10,r11
+       eor     r0,r9,r9,ror#5
+       add     r5,r5,r3
+       and     r2,r2,r9
+       eor     r3,r0,r9,ror#19
+       eor     r0,r5,r5,ror#11
+       eor     r2,r2,r11
+       add     r4,r4,r3,ror#6
+       eor     r3,r5,r6
+       eor     r0,r0,r5,ror#20
+       add     r4,r4,r2
+       ldr     r2,[sp,#64]
+       and     r12,r12,r3
+       add     r8,r8,r4
+       add     r4,r4,r0,ror#2
+       eor     r12,r12,r6
+       vst1.32 {q8},[r1,:128]!
+       ldr     r0,[r2,#0]
+       add     r4,r4,r12                       @ h+=Maj(a,b,c) from the past
+       ldr     r12,[r2,#4]
+       ldr     r3,[r2,#8]
+       ldr     r1,[r2,#12]
+       add     r4,r4,r0                        @ accumulate
+       ldr     r0,[r2,#16]
+       add     r5,r5,r12
+       ldr     r12,[r2,#20]
+       add     r6,r6,r3
+       ldr     r3,[r2,#24]
+       add     r7,r7,r1
+       ldr     r1,[r2,#28]
+       add     r8,r8,r0
+       str     r4,[r2],#4
+       add     r9,r9,r12
+       str     r5,[r2],#4
+       add     r10,r10,r3
+       str     r6,[r2],#4
+       add     r11,r11,r1
+       str     r7,[r2],#4
+       stmia   r2,{r8-r11}
+
+       ittte   ne
+       movne   r1,sp
+       ldrne   r2,[sp,#0]
+       eorne   r12,r12,r12
+       ldreq   sp,[sp,#76]                     @ restore original sp
+       itt     ne
+       eorne   r3,r5,r6
+       bne     .L_00_48
+
+       ldmia   sp!,{r4-r12,pc}
+.size  zfs_sha256_block_neon,.-zfs_sha256_block_neon
+
+# if defined(__thumb2__)
+#  define INST(a,b,c,d)        .byte   c,d|0xc,a,b
+# else
+#  define INST(a,b,c,d)        .byte   a,b,c,d
+# endif
+
+.globl zfs_sha256_block_armv8
+.type  zfs_sha256_block_armv8,%function
+.align 5
+zfs_sha256_block_armv8:
+.LARMv8:
+       vld1.32 {q0,q1},[r0]
+       sub     r3,r3,#256+32
+       add     r2,r1,r2,lsl#6  @ len to point at the end of inp
+       b       .Loop_v8
+
+.align 4
+.Loop_v8:
+       vld1.8          {q8-q9},[r1]!
+       vld1.8          {q10-q11},[r1]!
+       vld1.32         {q12},[r3]!
+       vrev32.8        q8,q8
+       vrev32.8        q9,q9
+       vrev32.8        q10,q10
+       vrev32.8        q11,q11
+       vmov            q14,q0  @ offload
+       vmov            q15,q1
+       teq             r1,r2
+       vld1.32         {q13},[r3]!
+       vadd.i32        q12,q12,q8
+       INST(0xe2,0x03,0xfa,0xf3)       @ sha256su0 q8,q9
+       vmov            q2,q0
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+       INST(0xe6,0x0c,0x64,0xf3)       @ sha256su1 q8,q10,q11
+       vld1.32         {q12},[r3]!
+       vadd.i32        q13,q13,q9
+       INST(0xe4,0x23,0xfa,0xf3)       @ sha256su0 q9,q10
+       vmov            q2,q0
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+       INST(0xe0,0x2c,0x66,0xf3)       @ sha256su1 q9,q11,q8
+       vld1.32         {q13},[r3]!
+       vadd.i32        q12,q12,q10
+       INST(0xe6,0x43,0xfa,0xf3)       @ sha256su0 q10,q11
+       vmov            q2,q0
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+       INST(0xe2,0x4c,0x60,0xf3)       @ sha256su1 q10,q8,q9
+       vld1.32         {q12},[r3]!
+       vadd.i32        q13,q13,q11
+       INST(0xe0,0x63,0xfa,0xf3)       @ sha256su0 q11,q8
+       vmov            q2,q0
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+       INST(0xe4,0x6c,0x62,0xf3)       @ sha256su1 q11,q9,q10
+       vld1.32         {q13},[r3]!
+       vadd.i32        q12,q12,q8
+       INST(0xe2,0x03,0xfa,0xf3)       @ sha256su0 q8,q9
+       vmov            q2,q0
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+       INST(0xe6,0x0c,0x64,0xf3)       @ sha256su1 q8,q10,q11
+       vld1.32         {q12},[r3]!
+       vadd.i32        q13,q13,q9
+       INST(0xe4,0x23,0xfa,0xf3)       @ sha256su0 q9,q10
+       vmov            q2,q0
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+       INST(0xe0,0x2c,0x66,0xf3)       @ sha256su1 q9,q11,q8
+       vld1.32         {q13},[r3]!
+       vadd.i32        q12,q12,q10
+       INST(0xe6,0x43,0xfa,0xf3)       @ sha256su0 q10,q11
+       vmov            q2,q0
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+       INST(0xe2,0x4c,0x60,0xf3)       @ sha256su1 q10,q8,q9
+       vld1.32         {q12},[r3]!
+       vadd.i32        q13,q13,q11
+       INST(0xe0,0x63,0xfa,0xf3)       @ sha256su0 q11,q8
+       vmov            q2,q0
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+       INST(0xe4,0x6c,0x62,0xf3)       @ sha256su1 q11,q9,q10
+       vld1.32         {q13},[r3]!
+       vadd.i32        q12,q12,q8
+       INST(0xe2,0x03,0xfa,0xf3)       @ sha256su0 q8,q9
+       vmov            q2,q0
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+       INST(0xe6,0x0c,0x64,0xf3)       @ sha256su1 q8,q10,q11
+       vld1.32         {q12},[r3]!
+       vadd.i32        q13,q13,q9
+       INST(0xe4,0x23,0xfa,0xf3)       @ sha256su0 q9,q10
+       vmov            q2,q0
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+       INST(0xe0,0x2c,0x66,0xf3)       @ sha256su1 q9,q11,q8
+       vld1.32         {q13},[r3]!
+       vadd.i32        q12,q12,q10
+       INST(0xe6,0x43,0xfa,0xf3)       @ sha256su0 q10,q11
+       vmov            q2,q0
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+       INST(0xe2,0x4c,0x60,0xf3)       @ sha256su1 q10,q8,q9
+       vld1.32         {q12},[r3]!
+       vadd.i32        q13,q13,q11
+       INST(0xe0,0x63,0xfa,0xf3)       @ sha256su0 q11,q8
+       vmov            q2,q0
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+       INST(0xe4,0x6c,0x62,0xf3)       @ sha256su1 q11,q9,q10
+       vld1.32         {q13},[r3]!
+       vadd.i32        q12,q12,q8
+       vmov            q2,q0
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+
+       vld1.32         {q12},[r3]!
+       vadd.i32        q13,q13,q9
+       vmov            q2,q0
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+
+       vld1.32         {q13},[r3]
+       vadd.i32        q12,q12,q10
+       sub             r3,r3,#256-16   @ rewind
+       vmov            q2,q0
+       INST(0x68,0x0c,0x02,0xf3)       @ sha256h q0,q1,q12
+       INST(0x68,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q12
+
+       vadd.i32        q13,q13,q11
+       vmov            q2,q0
+       INST(0x6a,0x0c,0x02,0xf3)       @ sha256h q0,q1,q13
+       INST(0x6a,0x2c,0x14,0xf3)       @ sha256h2 q1,q2,q13
+
+       vadd.i32        q0,q0,q14
+       vadd.i32        q1,q1,q15
+       it              ne
+       bne             .Loop_v8
+
+       vst1.32         {q0,q1},[r0]
+
+       bx      lr              @ bx lr
+.size  zfs_sha256_block_armv8,.-zfs_sha256_block_armv8
+
+#endif
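
The scalar ARMv7 code, the NEON-assisted variant and the ARMv8 crypto-extension variant above all compute the same FIPS 180-4 SHA-256 compression function; they differ only in how the rounds and the message schedule are mapped onto the instruction set. As a reading aid only (the C below is not part of the patch, and the helper name sha256_round_sketch is hypothetical), one compression round looks roughly like this:

/*
 * Minimal sketch of one SHA-256 round (FIPS 180-4), assuming the
 * message word w = W[i] has already been expanded with
 * sigma0(x) = ROTR7(x) ^ ROTR18(x) ^ (x >> 3) and
 * sigma1(x) = ROTR17(x) ^ ROTR19(x) ^ (x >> 10) -- the work done four
 * words at a time by the vshr/vsli/veor sequences above.
 */
#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, int n)
{
	return (x >> n) | (x << (32 - n));
}

/* s[0..7] are the working variables a..h, k is K256[i]. */
static inline void
sha256_round_sketch(uint32_t s[8], uint32_t k, uint32_t w)
{
	uint32_t S1  = rotr32(s[4], 6) ^ rotr32(s[4], 11) ^ rotr32(s[4], 25);
	uint32_t ch  = (s[4] & s[5]) ^ (~s[4] & s[6]);
	uint32_t t1  = s[7] + S1 + ch + k + w;
	uint32_t S0  = rotr32(s[0], 2) ^ rotr32(s[0], 13) ^ rotr32(s[0], 22);
	uint32_t maj = (s[0] & s[1]) ^ (s[0] & s[2]) ^ (s[1] & s[2]);
	uint32_t t2  = S0 + maj;

	/* Rotate the working variables: h=g, ..., e=d+t1, a=t1+t2. */
	s[7] = s[6]; s[6] = s[5]; s[5] = s[4]; s[4] = s[3] + t1;
	s[3] = s[2]; s[2] = s[1]; s[1] = s[0]; s[0] = t1 + t2;
}

The unrolled scalar blocks in zfs_sha256_block_armv7 and zfs_sha256_block_neon are essentially sixteen copies of this round per loop iteration, with the W[i] loads (ldr r2,[sp,#...]) interleaved and, as the "@ h+=Maj(a,b,c) from the past" comments note, the Maj() addition deferred by one round.
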
diff --git a/module/icp/asm-arm/sha2/sha512-armv7.S b/module/icp/asm-arm/sha2/sha512-armv7.S
new file mode 100644 (file)
index 0000000..a4c8040
--- /dev/null
+++ b/module/icp/asm-arm/sha2/sha512-armv7.S
@@ -0,0 +1,1822 @@
+/*
+ * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Portions Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ * - modified assembly to fit into OpenZFS
+ */
+
+#if defined(__arm__)
+
+#define        __ARM_ARCH__      7
+#define        __ARM_MAX_ARCH__  7
+
+#ifndef __KERNEL__
+# define VFP_ABI_PUSH  vstmdb  sp!,{d8-d15}
+# define VFP_ABI_POP   vldmia  sp!,{d8-d15}
+#else
+# define VFP_ABI_PUSH
+# define VFP_ABI_POP
+#endif
+
+#ifdef __ARMEL__
+# define LO 0
+# define HI 4
+# define WORD64(hi0,lo0,hi1,lo1)       .word   lo0,hi0, lo1,hi1
+#else
+# define HI 0
+# define LO 4
+# define WORD64(hi0,lo0,hi1,lo1)       .word   hi0,lo0, hi1,lo1
+#endif
+
+#if defined(__thumb2__)
+.syntax        unified
+.thumb
+# define adrl adr
+#else
+.code  32
+#endif
+
+.text
+
+.type  K512,%object
+.align 5
+K512:
+       WORD64(0x428a2f98,0xd728ae22,   0x71374491,0x23ef65cd)
+       WORD64(0xb5c0fbcf,0xec4d3b2f,   0xe9b5dba5,0x8189dbbc)
+       WORD64(0x3956c25b,0xf348b538,   0x59f111f1,0xb605d019)
+       WORD64(0x923f82a4,0xaf194f9b,   0xab1c5ed5,0xda6d8118)
+       WORD64(0xd807aa98,0xa3030242,   0x12835b01,0x45706fbe)
+       WORD64(0x243185be,0x4ee4b28c,   0x550c7dc3,0xd5ffb4e2)
+       WORD64(0x72be5d74,0xf27b896f,   0x80deb1fe,0x3b1696b1)
+       WORD64(0x9bdc06a7,0x25c71235,   0xc19bf174,0xcf692694)
+       WORD64(0xe49b69c1,0x9ef14ad2,   0xefbe4786,0x384f25e3)
+       WORD64(0x0fc19dc6,0x8b8cd5b5,   0x240ca1cc,0x77ac9c65)
+       WORD64(0x2de92c6f,0x592b0275,   0x4a7484aa,0x6ea6e483)
+       WORD64(0x5cb0a9dc,0xbd41fbd4,   0x76f988da,0x831153b5)
+       WORD64(0x983e5152,0xee66dfab,   0xa831c66d,0x2db43210)
+       WORD64(0xb00327c8,0x98fb213f,   0xbf597fc7,0xbeef0ee4)
+       WORD64(0xc6e00bf3,0x3da88fc2,   0xd5a79147,0x930aa725)
+       WORD64(0x06ca6351,0xe003826f,   0x14292967,0x0a0e6e70)
+       WORD64(0x27b70a85,0x46d22ffc,   0x2e1b2138,0x5c26c926)
+       WORD64(0x4d2c6dfc,0x5ac42aed,   0x53380d13,0x9d95b3df)
+       WORD64(0x650a7354,0x8baf63de,   0x766a0abb,0x3c77b2a8)
+       WORD64(0x81c2c92e,0x47edaee6,   0x92722c85,0x1482353b)
+       WORD64(0xa2bfe8a1,0x4cf10364,   0xa81a664b,0xbc423001)
+       WORD64(0xc24b8b70,0xd0f89791,   0xc76c51a3,0x0654be30)
+       WORD64(0xd192e819,0xd6ef5218,   0xd6990624,0x5565a910)
+       WORD64(0xf40e3585,0x5771202a,   0x106aa070,0x32bbd1b8)
+       WORD64(0x19a4c116,0xb8d2d0c8,   0x1e376c08,0x5141ab53)
+       WORD64(0x2748774c,0xdf8eeb99,   0x34b0bcb5,0xe19b48a8)
+       WORD64(0x391c0cb3,0xc5c95a63,   0x4ed8aa4a,0xe3418acb)
+       WORD64(0x5b9cca4f,0x7763e373,   0x682e6ff3,0xd6b2b8a3)
+       WORD64(0x748f82ee,0x5defb2fc,   0x78a5636f,0x43172f60)
+       WORD64(0x84c87814,0xa1f0ab72,   0x8cc70208,0x1a6439ec)
+       WORD64(0x90befffa,0x23631e28,   0xa4506ceb,0xde82bde9)
+       WORD64(0xbef9a3f7,0xb2c67915,   0xc67178f2,0xe372532b)
+       WORD64(0xca273ece,0xea26619c,   0xd186b8c7,0x21c0c207)
+       WORD64(0xeada7dd6,0xcde0eb1e,   0xf57d4f7f,0xee6ed178)
+       WORD64(0x06f067aa,0x72176fba,   0x0a637dc5,0xa2c898a6)
+       WORD64(0x113f9804,0xbef90dae,   0x1b710b35,0x131c471b)
+       WORD64(0x28db77f5,0x23047d84,   0x32caab7b,0x40c72493)
+       WORD64(0x3c9ebe0a,0x15c9bebc,   0x431d67c4,0x9c100d4c)
+       WORD64(0x4cc5d4be,0xcb3e42b6,   0x597f299c,0xfc657e2a)
+       WORD64(0x5fcb6fab,0x3ad6faec,   0x6c44198c,0x4a475817)
+.size  K512,.-K512
+.word  0                               @ terminator
+
+.align 5
+.globl zfs_sha512_block_armv7
+.type  zfs_sha512_block_armv7,%function
+zfs_sha512_block_armv7:
+.Lzfs_sha512_block_armv7:
+
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
+       sub     r3,pc,#8                @ zfs_sha512_block_armv7
+#else
+       adr     r3,.Lzfs_sha512_block_armv7
+#endif
+
+       add     r2,r1,r2,lsl#7  @ len to point at the end of inp
+       stmdb   sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+       sub     r14,r3,#672             @ K512
+       sub     sp,sp,#9*8
+
+       ldr     r7,[r0,#32+LO]
+       ldr     r8,[r0,#32+HI]
+       ldr     r9, [r0,#48+LO]
+       ldr     r10, [r0,#48+HI]
+       ldr     r11, [r0,#56+LO]
+       ldr     r12, [r0,#56+HI]
+.Loop:
+       str     r9, [sp,#48+0]
+       str     r10, [sp,#48+4]
+       str     r11, [sp,#56+0]
+       str     r12, [sp,#56+4]
+       ldr     r5,[r0,#0+LO]
+       ldr     r6,[r0,#0+HI]
+       ldr     r3,[r0,#8+LO]
+       ldr     r4,[r0,#8+HI]
+       ldr     r9, [r0,#16+LO]
+       ldr     r10, [r0,#16+HI]
+       ldr     r11, [r0,#24+LO]
+       ldr     r12, [r0,#24+HI]
+       str     r3,[sp,#8+0]
+       str     r4,[sp,#8+4]
+       str     r9, [sp,#16+0]
+       str     r10, [sp,#16+4]
+       str     r11, [sp,#24+0]
+       str     r12, [sp,#24+4]
+       ldr     r3,[r0,#40+LO]
+       ldr     r4,[r0,#40+HI]
+       str     r3,[sp,#40+0]
+       str     r4,[sp,#40+4]
+
+.L00_15:
+#if __ARM_ARCH__<7
+       ldrb    r3,[r1,#7]
+       ldrb    r9, [r1,#6]
+       ldrb    r10, [r1,#5]
+       ldrb    r11, [r1,#4]
+       ldrb    r4,[r1,#3]
+       ldrb    r12, [r1,#2]
+       orr     r3,r3,r9,lsl#8
+       ldrb    r9, [r1,#1]
+       orr     r3,r3,r10,lsl#16
+       ldrb    r10, [r1],#8
+       orr     r3,r3,r11,lsl#24
+       orr     r4,r4,r12,lsl#8
+       orr     r4,r4,r9,lsl#16
+       orr     r4,r4,r10,lsl#24
+#else
+       ldr     r3,[r1,#4]
+       ldr     r4,[r1],#8
+#ifdef __ARMEL__
+       rev     r3,r3
+       rev     r4,r4
+#endif
+#endif
+       @ Sigma1(x)     (ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
+       @ LO            lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+       @ HI            hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
+       mov     r9,r7,lsr#14
+       str     r3,[sp,#64+0]
+       mov     r10,r8,lsr#14
+       str     r4,[sp,#64+4]
+       eor     r9,r9,r8,lsl#18
+       ldr     r11,[sp,#56+0]  @ h.lo
+       eor     r10,r10,r7,lsl#18
+       ldr     r12,[sp,#56+4]  @ h.hi
+       eor     r9,r9,r7,lsr#18
+       eor     r10,r10,r8,lsr#18
+       eor     r9,r9,r8,lsl#14
+       eor     r10,r10,r7,lsl#14
+       eor     r9,r9,r8,lsr#9
+       eor     r10,r10,r7,lsr#9
+       eor     r9,r9,r7,lsl#23
+       eor     r10,r10,r8,lsl#23       @ Sigma1(e)
+       adds    r3,r3,r9
+       ldr     r9,[sp,#40+0]   @ f.lo
+       adc     r4,r4,r10               @ T += Sigma1(e)
+       ldr     r10,[sp,#40+4]  @ f.hi
+       adds    r3,r3,r11
+       ldr     r11,[sp,#48+0]  @ g.lo
+       adc     r4,r4,r12               @ T += h
+       ldr     r12,[sp,#48+4]  @ g.hi
+
+       eor     r9,r9,r11
+       str     r7,[sp,#32+0]
+       eor     r10,r10,r12
+       str     r8,[sp,#32+4]
+       and     r9,r9,r7
+       str     r5,[sp,#0+0]
+       and     r10,r10,r8
+       str     r6,[sp,#0+4]
+       eor     r9,r9,r11
+       ldr     r11,[r14,#LO]   @ K[i].lo
+       eor     r10,r10,r12             @ Ch(e,f,g)
+       ldr     r12,[r14,#HI]   @ K[i].hi
+
+       adds    r3,r3,r9
+       ldr     r7,[sp,#24+0]   @ d.lo
+       adc     r4,r4,r10               @ T += Ch(e,f,g)
+       ldr     r8,[sp,#24+4]   @ d.hi
+       adds    r3,r3,r11
+       and     r9,r11,#0xff
+       adc     r4,r4,r12               @ T += K[i]
+       adds    r7,r7,r3
+       ldr     r11,[sp,#8+0]   @ b.lo
+       adc     r8,r8,r4                @ d += T
+       teq     r9,#148
+
+       ldr     r12,[sp,#16+0]  @ c.lo
+#ifdef __thumb2__
+       it      eq                      @ Thumb2 thing, sanity check in ARM
+#endif
+       orreq   r14,r14,#1
+       @ Sigma0(x)     (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+       @ LO            lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+       @ HI            hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+       mov     r9,r5,lsr#28
+       mov     r10,r6,lsr#28
+       eor     r9,r9,r6,lsl#4
+       eor     r10,r10,r5,lsl#4
+       eor     r9,r9,r6,lsr#2
+       eor     r10,r10,r5,lsr#2
+       eor     r9,r9,r5,lsl#30
+       eor     r10,r10,r6,lsl#30
+       eor     r9,r9,r6,lsr#7
+       eor     r10,r10,r5,lsr#7
+       eor     r9,r9,r5,lsl#25
+       eor     r10,r10,r6,lsl#25       @ Sigma0(a)
+       adds    r3,r3,r9
+       and     r9,r5,r11
+       adc     r4,r4,r10               @ T += Sigma0(a)
+
+       ldr     r10,[sp,#8+4]   @ b.hi
+       orr     r5,r5,r11
+       ldr     r11,[sp,#16+4]  @ c.hi
+       and     r5,r5,r12
+       and     r12,r6,r10
+       orr     r6,r6,r10
+       orr     r5,r5,r9                @ Maj(a,b,c).lo
+       and     r6,r6,r11
+       adds    r5,r5,r3
+       orr     r6,r6,r12               @ Maj(a,b,c).hi
+       sub     sp,sp,#8
+       adc     r6,r6,r4                @ h += T
+       tst     r14,#1
+       add     r14,r14,#8
+       tst     r14,#1
+       beq     .L00_15
+       ldr     r9,[sp,#184+0]
+       ldr     r10,[sp,#184+4]
+       bic     r14,r14,#1
+.L16_79:
+       @ sigma0(x)     (ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
+       @ LO            lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
+       @ HI            hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
+       mov     r3,r9,lsr#1
+       ldr     r11,[sp,#80+0]
+       mov     r4,r10,lsr#1
+       ldr     r12,[sp,#80+4]
+       eor     r3,r3,r10,lsl#31
+       eor     r4,r4,r9,lsl#31
+       eor     r3,r3,r9,lsr#8
+       eor     r4,r4,r10,lsr#8
+       eor     r3,r3,r10,lsl#24
+       eor     r4,r4,r9,lsl#24
+       eor     r3,r3,r9,lsr#7
+       eor     r4,r4,r10,lsr#7
+       eor     r3,r3,r10,lsl#25
+
+       @ sigma1(x)     (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
+       @ LO            lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
+       @ HI            hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
+       mov     r9,r11,lsr#19
+       mov     r10,r12,lsr#19
+       eor     r9,r9,r12,lsl#13
+       eor     r10,r10,r11,lsl#13
+       eor     r9,r9,r12,lsr#29
+       eor     r10,r10,r11,lsr#29
+       eor     r9,r9,r11,lsl#3
+       eor     r10,r10,r12,lsl#3
+       eor     r9,r9,r11,lsr#6
+       eor     r10,r10,r12,lsr#6
+       ldr     r11,[sp,#120+0]
+       eor     r9,r9,r12,lsl#26
+
+       ldr     r12,[sp,#120+4]
+       adds    r3,r3,r9
+       ldr     r9,[sp,#192+0]
+       adc     r4,r4,r10
+
+       ldr     r10,[sp,#192+4]
+       adds    r3,r3,r11
+       adc     r4,r4,r12
+       adds    r3,r3,r9
+       adc     r4,r4,r10
+       @ Sigma1(x)     (ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
+       @ LO            lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
+       @ HI            hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
+       mov     r9,r7,lsr#14
+       str     r3,[sp,#64+0]
+       mov     r10,r8,lsr#14
+       str     r4,[sp,#64+4]
+       eor     r9,r9,r8,lsl#18
+       ldr     r11,[sp,#56+0]  @ h.lo
+       eor     r10,r10,r7,lsl#18
+       ldr     r12,[sp,#56+4]  @ h.hi
+       eor     r9,r9,r7,lsr#18
+       eor     r10,r10,r8,lsr#18
+       eor     r9,r9,r8,lsl#14
+       eor     r10,r10,r7,lsl#14
+       eor     r9,r9,r8,lsr#9
+       eor     r10,r10,r7,lsr#9
+       eor     r9,r9,r7,lsl#23
+       eor     r10,r10,r8,lsl#23       @ Sigma1(e)
+       adds    r3,r3,r9
+       ldr     r9,[sp,#40+0]   @ f.lo
+       adc     r4,r4,r10               @ T += Sigma1(e)
+       ldr     r10,[sp,#40+4]  @ f.hi
+       adds    r3,r3,r11
+       ldr     r11,[sp,#48+0]  @ g.lo
+       adc     r4,r4,r12               @ T += h
+       ldr     r12,[sp,#48+4]  @ g.hi
+
+       eor     r9,r9,r11
+       str     r7,[sp,#32+0]
+       eor     r10,r10,r12
+       str     r8,[sp,#32+4]
+       and     r9,r9,r7
+       str     r5,[sp,#0+0]
+       and     r10,r10,r8
+       str     r6,[sp,#0+4]
+       eor     r9,r9,r11
+       ldr     r11,[r14,#LO]   @ K[i].lo
+       eor     r10,r10,r12             @ Ch(e,f,g)
+       ldr     r12,[r14,#HI]   @ K[i].hi
+
+       adds    r3,r3,r9
+       ldr     r7,[sp,#24+0]   @ d.lo
+       adc     r4,r4,r10               @ T += Ch(e,f,g)
+       ldr     r8,[sp,#24+4]   @ d.hi
+       adds    r3,r3,r11
+       and     r9,r11,#0xff
+       adc     r4,r4,r12               @ T += K[i]
+       adds    r7,r7,r3
+       ldr     r11,[sp,#8+0]   @ b.lo
+       adc     r8,r8,r4                @ d += T
+       teq     r9,#23
+
+       ldr     r12,[sp,#16+0]  @ c.lo
+#ifdef __thumb2__
+       it      eq                      @ Thumb2 thing, sanity check in ARM
+#endif
+       orreq   r14,r14,#1
+       @ Sigma0(x)     (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
+       @ LO            lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
+       @ HI            hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
+       mov     r9,r5,lsr#28
+       mov     r10,r6,lsr#28
+       eor     r9,r9,r6,lsl#4
+       eor     r10,r10,r5,lsl#4
+       eor     r9,r9,r6,lsr#2
+       eor     r10,r10,r5,lsr#2
+       eor     r9,r9,r5,lsl#30
+       eor     r10,r10,r6,lsl#30
+       eor     r9,r9,r6,lsr#7
+       eor     r10,r10,r5,lsr#7
+       eor     r9,r9,r5,lsl#25
+       eor     r10,r10,r6,lsl#25       @ Sigma0(a)
+       adds    r3,r3,r9
+       and     r9,r5,r11
+       adc     r4,r4,r10               @ T += Sigma0(a)
+
+       ldr     r10,[sp,#8+4]   @ b.hi
+       orr     r5,r5,r11
+       ldr     r11,[sp,#16+4]  @ c.hi
+       and     r5,r5,r12
+       and     r12,r6,r10
+       orr     r6,r6,r10
+       orr     r5,r5,r9                @ Maj(a,b,c).lo
+       and     r6,r6,r11
+       adds    r5,r5,r3
+       orr     r6,r6,r12               @ Maj(a,b,c).hi
+       sub     sp,sp,#8
+       adc     r6,r6,r4                @ h += T
+       tst     r14,#1
+       add     r14,r14,#8
+#ifdef __thumb2__
+       ittt    eq                      @ Thumb2 thing, sanity check in ARM
+#endif
+       ldreq   r9,[sp,#184+0]
+       ldreq   r10,[sp,#184+4]
+       beq     .L16_79
+       bic     r14,r14,#1
+
+       ldr     r3,[sp,#8+0]
+       ldr     r4,[sp,#8+4]
+       ldr     r9, [r0,#0+LO]
+       ldr     r10, [r0,#0+HI]
+       ldr     r11, [r0,#8+LO]
+       ldr     r12, [r0,#8+HI]
+       adds    r9,r5,r9
+       str     r9, [r0,#0+LO]
+       adc     r10,r6,r10
+       str     r10, [r0,#0+HI]
+       adds    r11,r3,r11
+       str     r11, [r0,#8+LO]
+       adc     r12,r4,r12
+       str     r12, [r0,#8+HI]
+
+       ldr     r5,[sp,#16+0]
+       ldr     r6,[sp,#16+4]
+       ldr     r3,[sp,#24+0]
+       ldr     r4,[sp,#24+4]
+       ldr     r9, [r0,#16+LO]
+       ldr     r10, [r0,#16+HI]
+       ldr     r11, [r0,#24+LO]
+       ldr     r12, [r0,#24+HI]
+       adds    r9,r5,r9
+       str     r9, [r0,#16+LO]
+       adc     r10,r6,r10
+       str     r10, [r0,#16+HI]
+       adds    r11,r3,r11
+       str     r11, [r0,#24+LO]
+       adc     r12,r4,r12
+       str     r12, [r0,#24+HI]
+
+       ldr     r3,[sp,#40+0]
+       ldr     r4,[sp,#40+4]
+       ldr     r9, [r0,#32+LO]
+       ldr     r10, [r0,#32+HI]
+       ldr     r11, [r0,#40+LO]
+       ldr     r12, [r0,#40+HI]
+       adds    r7,r7,r9
+       str     r7,[r0,#32+LO]
+       adc     r8,r8,r10
+       str     r8,[r0,#32+HI]
+       adds    r11,r3,r11
+       str     r11, [r0,#40+LO]
+       adc     r12,r4,r12
+       str     r12, [r0,#40+HI]
+
+       ldr     r5,[sp,#48+0]
+       ldr     r6,[sp,#48+4]
+       ldr     r3,[sp,#56+0]
+       ldr     r4,[sp,#56+4]
+       ldr     r9, [r0,#48+LO]
+       ldr     r10, [r0,#48+HI]
+       ldr     r11, [r0,#56+LO]
+       ldr     r12, [r0,#56+HI]
+       adds    r9,r5,r9
+       str     r9, [r0,#48+LO]
+       adc     r10,r6,r10
+       str     r10, [r0,#48+HI]
+       adds    r11,r3,r11
+       str     r11, [r0,#56+LO]
+       adc     r12,r4,r12
+       str     r12, [r0,#56+HI]
+
+       add     sp,sp,#640
+       sub     r14,r14,#640
+
+       teq     r1,r2
+       bne     .Loop
+
+       add     sp,sp,#8*9              @ destroy frame
+
+#if __ARM_ARCH__>=5
+       ldmia   sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
+#else
+       ldmia   sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
+       tst     lr,#1
+       moveq   pc,lr                   @ be binary compatible with V4, yet
+.word  0xe12fff1e                      @ interoperable with Thumb ISA:-)
+#endif
+.size  zfs_sha512_block_armv7,.-zfs_sha512_block_armv7
+
+.arch  armv7-a
+.fpu   neon
+
+.globl zfs_sha512_block_neon
+.type  zfs_sha512_block_neon,%function
+.align 4
+zfs_sha512_block_neon:
+.LNEON:
+       dmb     @ errata #451034 on early Cortex A8
+       add     r2,r1,r2,lsl#7  @ len to point at the end of inp
+       adr     r3,K512
+       VFP_ABI_PUSH
+       vldmia  r0,{d16,d17,d18,d19,d20,d21,d22,d23}            @ load context
+.Loop_neon:
+       vshr.u64        d24,d20,#14     @ 0
+#if 0<16
+       vld1.64 {d0},[r1]!      @ handles unaligned
+#endif
+       vshr.u64        d25,d20,#18
+#if 0>0
+       vadd.i64        d16,d30                 @ h+=Maj from the past
+#endif
+       vshr.u64        d26,d20,#41
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d20,#50
+       vsli.64 d25,d20,#46
+       vmov    d29,d20
+       vsli.64 d26,d20,#23
+#if 0<16 && defined(__ARMEL__)
+       vrev64.8        d0,d0
+#endif
+       veor    d25,d24
+       vbsl    d29,d21,d22             @ Ch(e,f,g)
+       vshr.u64        d24,d16,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d23
+       vshr.u64        d25,d16,#34
+       vsli.64 d24,d16,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d16,#39
+       vadd.i64        d28,d0
+       vsli.64 d25,d16,#30
+       veor    d30,d16,d17
+       vsli.64 d26,d16,#25
+       veor    d23,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d18,d17             @ Maj(a,b,c)
+       veor    d23,d26                 @ Sigma0(a)
+       vadd.i64        d19,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d23,d30
+       vshr.u64        d24,d19,#14     @ 1
+#if 1<16
+       vld1.64 {d1},[r1]!      @ handles unaligned
+#endif
+       vshr.u64        d25,d19,#18
+#if 1>0
+       vadd.i64        d23,d30                 @ h+=Maj from the past
+#endif
+       vshr.u64        d26,d19,#41
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d19,#50
+       vsli.64 d25,d19,#46
+       vmov    d29,d19
+       vsli.64 d26,d19,#23
+#if 1<16 && defined(__ARMEL__)
+       vrev64.8        d1,d1
+#endif
+       veor    d25,d24
+       vbsl    d29,d20,d21             @ Ch(e,f,g)
+       vshr.u64        d24,d23,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d22
+       vshr.u64        d25,d23,#34
+       vsli.64 d24,d23,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d23,#39
+       vadd.i64        d28,d1
+       vsli.64 d25,d23,#30
+       veor    d30,d23,d16
+       vsli.64 d26,d23,#25
+       veor    d22,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d17,d16             @ Maj(a,b,c)
+       veor    d22,d26                 @ Sigma0(a)
+       vadd.i64        d18,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d22,d30
+       vshr.u64        d24,d18,#14     @ 2
+#if 2<16
+       vld1.64 {d2},[r1]!      @ handles unaligned
+#endif
+       vshr.u64        d25,d18,#18
+#if 2>0
+       vadd.i64        d22,d30                 @ h+=Maj from the past
+#endif
+       vshr.u64        d26,d18,#41
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d18,#50
+       vsli.64 d25,d18,#46
+       vmov    d29,d18
+       vsli.64 d26,d18,#23
+#if 2<16 && defined(__ARMEL__)
+       vrev64.8        d2,d2
+#endif
+       veor    d25,d24
+       vbsl    d29,d19,d20             @ Ch(e,f,g)
+       vshr.u64        d24,d22,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d21
+       vshr.u64        d25,d22,#34
+       vsli.64 d24,d22,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d22,#39
+       vadd.i64        d28,d2
+       vsli.64 d25,d22,#30
+       veor    d30,d22,d23
+       vsli.64 d26,d22,#25
+       veor    d21,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d16,d23             @ Maj(a,b,c)
+       veor    d21,d26                 @ Sigma0(a)
+       vadd.i64        d17,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d21,d30
+       vshr.u64        d24,d17,#14     @ 3
+#if 3<16
+       vld1.64 {d3},[r1]!      @ handles unaligned
+#endif
+       vshr.u64        d25,d17,#18
+#if 3>0
+       vadd.i64        d21,d30                 @ h+=Maj from the past
+#endif
+       vshr.u64        d26,d17,#41
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d17,#50
+       vsli.64 d25,d17,#46
+       vmov    d29,d17
+       vsli.64 d26,d17,#23
+#if 3<16 && defined(__ARMEL__)
+       vrev64.8        d3,d3
+#endif
+       veor    d25,d24
+       vbsl    d29,d18,d19             @ Ch(e,f,g)
+       vshr.u64        d24,d21,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d20
+       vshr.u64        d25,d21,#34
+       vsli.64 d24,d21,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d21,#39
+       vadd.i64        d28,d3
+       vsli.64 d25,d21,#30
+       veor    d30,d21,d22
+       vsli.64 d26,d21,#25
+       veor    d20,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d23,d22             @ Maj(a,b,c)
+       veor    d20,d26                 @ Sigma0(a)
+       vadd.i64        d16,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d20,d30
+       vshr.u64        d24,d16,#14     @ 4
+#if 4<16
+       vld1.64 {d4},[r1]!      @ handles unaligned
+#endif
+       vshr.u64        d25,d16,#18
+#if 4>0
+       vadd.i64        d20,d30                 @ h+=Maj from the past
+#endif
+       vshr.u64        d26,d16,#41
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d16,#50
+       vsli.64 d25,d16,#46
+       vmov    d29,d16
+       vsli.64 d26,d16,#23
+#if 4<16 && defined(__ARMEL__)
+       vrev64.8        d4,d4
+#endif
+       veor    d25,d24
+       vbsl    d29,d17,d18             @ Ch(e,f,g)
+       vshr.u64        d24,d20,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d19
+       vshr.u64        d25,d20,#34
+       vsli.64 d24,d20,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d20,#39
+       vadd.i64        d28,d4
+       vsli.64 d25,d20,#30
+       veor    d30,d20,d21
+       vsli.64 d26,d20,#25
+       veor    d19,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d22,d21             @ Maj(a,b,c)
+       veor    d19,d26                 @ Sigma0(a)
+       vadd.i64        d23,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d19,d30
+       vshr.u64        d24,d23,#14     @ 5
+#if 5<16
+       vld1.64 {d5},[r1]!      @ handles unaligned
+#endif
+       vshr.u64        d25,d23,#18
+#if 5>0
+       vadd.i64        d19,d30                 @ h+=Maj from the past
+#endif
+       vshr.u64        d26,d23,#41
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d23,#50
+       vsli.64 d25,d23,#46
+       vmov    d29,d23
+       vsli.64 d26,d23,#23
+#if 5<16 && defined(__ARMEL__)
+       vrev64.8        d5,d5
+#endif
+       veor    d25,d24
+       vbsl    d29,d16,d17             @ Ch(e,f,g)
+       vshr.u64        d24,d19,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d18
+       vshr.u64        d25,d19,#34
+       vsli.64 d24,d19,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d19,#39
+       vadd.i64        d28,d5
+       vsli.64 d25,d19,#30
+       veor    d30,d19,d20
+       vsli.64 d26,d19,#25
+       veor    d18,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d21,d20             @ Maj(a,b,c)
+       veor    d18,d26                 @ Sigma0(a)
+       vadd.i64        d22,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d18,d30
+       vshr.u64        d24,d22,#14     @ 6
+#if 6<16
+       vld1.64 {d6},[r1]!      @ handles unaligned
+#endif
+       vshr.u64        d25,d22,#18
+#if 6>0
+       vadd.i64        d18,d30                 @ h+=Maj from the past
+#endif
+       vshr.u64        d26,d22,#41
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d22,#50
+       vsli.64 d25,d22,#46
+       vmov    d29,d22
+       vsli.64 d26,d22,#23
+#if 6<16 && defined(__ARMEL__)
+       vrev64.8        d6,d6
+#endif
+       veor    d25,d24
+       vbsl    d29,d23,d16             @ Ch(e,f,g)
+       vshr.u64        d24,d18,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d17
+       vshr.u64        d25,d18,#34
+       vsli.64 d24,d18,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d18,#39
+       vadd.i64        d28,d6
+       vsli.64 d25,d18,#30
+       veor    d30,d18,d19
+       vsli.64 d26,d18,#25
+       veor    d17,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d20,d19             @ Maj(a,b,c)
+       veor    d17,d26                 @ Sigma0(a)
+       vadd.i64        d21,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d17,d30
+       vshr.u64        d24,d21,#14     @ 7
+#if 7<16
+       vld1.64 {d7},[r1]!      @ handles unaligned
+#endif
+       vshr.u64        d25,d21,#18
+#if 7>0
+       vadd.i64        d17,d30                 @ h+=Maj from the past
+#endif
+       vshr.u64        d26,d21,#41
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d21,#50
+       vsli.64 d25,d21,#46
+       vmov    d29,d21
+       vsli.64 d26,d21,#23
+#if 7<16 && defined(__ARMEL__)
+       vrev64.8        d7,d7
+#endif
+       veor    d25,d24
+       vbsl    d29,d22,d23             @ Ch(e,f,g)
+       vshr.u64        d24,d17,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d16
+       vshr.u64        d25,d17,#34
+       vsli.64 d24,d17,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d17,#39
+       vadd.i64        d28,d7
+       vsli.64 d25,d17,#30
+       veor    d30,d17,d18
+       vsli.64 d26,d17,#25
+       veor    d16,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d19,d18             @ Maj(a,b,c)
+       veor    d16,d26                 @ Sigma0(a)
+       vadd.i64        d20,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d16,d30
+       vshr.u64        d24,d20,#14     @ 8
+#if 8<16
+       vld1.64 {d8},[r1]!      @ handles unaligned
+#endif
+       vshr.u64        d25,d20,#18
+#if 8>0
+       vadd.i64        d16,d30                 @ h+=Maj from the past
+#endif
+       vshr.u64        d26,d20,#41
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d20,#50
+       vsli.64 d25,d20,#46
+       vmov    d29,d20
+       vsli.64 d26,d20,#23
+#if 8<16 && defined(__ARMEL__)
+       vrev64.8        d8,d8
+#endif
+       veor    d25,d24
+       vbsl    d29,d21,d22             @ Ch(e,f,g)
+       vshr.u64        d24,d16,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d23
+       vshr.u64        d25,d16,#34
+       vsli.64 d24,d16,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d16,#39
+       vadd.i64        d28,d8
+       vsli.64 d25,d16,#30
+       veor    d30,d16,d17
+       vsli.64 d26,d16,#25
+       veor    d23,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d18,d17             @ Maj(a,b,c)
+       veor    d23,d26                 @ Sigma0(a)
+       vadd.i64        d19,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d23,d30
+       vshr.u64        d24,d19,#14     @ 9
+#if 9<16
+       vld1.64 {d9},[r1]!      @ handles unaligned
+#endif
+       vshr.u64        d25,d19,#18
+#if 9>0
+       vadd.i64        d23,d30                 @ h+=Maj from the past
+#endif
+       vshr.u64        d26,d19,#41
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d19,#50
+       vsli.64 d25,d19,#46
+       vmov    d29,d19
+       vsli.64 d26,d19,#23
+#if 9<16 && defined(__ARMEL__)
+       vrev64.8        d9,d9
+#endif
+       veor    d25,d24
+       vbsl    d29,d20,d21             @ Ch(e,f,g)
+       vshr.u64        d24,d23,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d22
+       vshr.u64        d25,d23,#34
+       vsli.64 d24,d23,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d23,#39
+       vadd.i64        d28,d9
+       vsli.64 d25,d23,#30
+       veor    d30,d23,d16
+       vsli.64 d26,d23,#25
+       veor    d22,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d17,d16             @ Maj(a,b,c)
+       veor    d22,d26                 @ Sigma0(a)
+       vadd.i64        d18,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d22,d30
+       vshr.u64        d24,d18,#14     @ 10
+#if 10<16
+       vld1.64 {d10},[r1]!     @ handles unaligned
+#endif
+       vshr.u64        d25,d18,#18
+#if 10>0
+       vadd.i64        d22,d30                 @ h+=Maj from the past
+#endif
+       vshr.u64        d26,d18,#41
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d18,#50
+       vsli.64 d25,d18,#46
+       vmov    d29,d18
+       vsli.64 d26,d18,#23
+#if 10<16 && defined(__ARMEL__)
+       vrev64.8        d10,d10
+#endif
+       veor    d25,d24
+       vbsl    d29,d19,d20             @ Ch(e,f,g)
+       vshr.u64        d24,d22,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d21
+       vshr.u64        d25,d22,#34
+       vsli.64 d24,d22,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d22,#39
+       vadd.i64        d28,d10
+       vsli.64 d25,d22,#30
+       veor    d30,d22,d23
+       vsli.64 d26,d22,#25
+       veor    d21,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d16,d23             @ Maj(a,b,c)
+       veor    d21,d26                 @ Sigma0(a)
+       vadd.i64        d17,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d21,d30
+       vshr.u64        d24,d17,#14     @ 11
+#if 11<16
+       vld1.64 {d11},[r1]!     @ handles unaligned
+#endif
+       vshr.u64        d25,d17,#18
+#if 11>0
+       vadd.i64        d21,d30                 @ h+=Maj from the past
+#endif
+       vshr.u64        d26,d17,#41
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d17,#50
+       vsli.64 d25,d17,#46
+       vmov    d29,d17
+       vsli.64 d26,d17,#23
+#if 11<16 && defined(__ARMEL__)
+       vrev64.8        d11,d11
+#endif
+       veor    d25,d24
+       vbsl    d29,d18,d19             @ Ch(e,f,g)
+       vshr.u64        d24,d21,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d20
+       vshr.u64        d25,d21,#34
+       vsli.64 d24,d21,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d21,#39
+       vadd.i64        d28,d11
+       vsli.64 d25,d21,#30
+       veor    d30,d21,d22
+       vsli.64 d26,d21,#25
+       veor    d20,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d23,d22             @ Maj(a,b,c)
+       veor    d20,d26                 @ Sigma0(a)
+       vadd.i64        d16,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d20,d30
+       vshr.u64        d24,d16,#14     @ 12
+#if 12<16
+       vld1.64 {d12},[r1]!     @ handles unaligned
+#endif
+       vshr.u64        d25,d16,#18
+#if 12>0
+       vadd.i64        d20,d30                 @ h+=Maj from the past
+#endif
+       vshr.u64        d26,d16,#41
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d16,#50
+       vsli.64 d25,d16,#46
+       vmov    d29,d16
+       vsli.64 d26,d16,#23
+#if 12<16 && defined(__ARMEL__)
+       vrev64.8        d12,d12
+#endif
+       veor    d25,d24
+       vbsl    d29,d17,d18             @ Ch(e,f,g)
+       vshr.u64        d24,d20,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d19
+       vshr.u64        d25,d20,#34
+       vsli.64 d24,d20,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d20,#39
+       vadd.i64        d28,d12
+       vsli.64 d25,d20,#30
+       veor    d30,d20,d21
+       vsli.64 d26,d20,#25
+       veor    d19,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d22,d21             @ Maj(a,b,c)
+       veor    d19,d26                 @ Sigma0(a)
+       vadd.i64        d23,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d19,d30
+       vshr.u64        d24,d23,#14     @ 13
+#if 13<16
+       vld1.64 {d13},[r1]!     @ handles unaligned
+#endif
+       vshr.u64        d25,d23,#18
+#if 13>0
+       vadd.i64        d19,d30                 @ h+=Maj from the past
+#endif
+       vshr.u64        d26,d23,#41
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d23,#50
+       vsli.64 d25,d23,#46
+       vmov    d29,d23
+       vsli.64 d26,d23,#23
+#if 13<16 && defined(__ARMEL__)
+       vrev64.8        d13,d13
+#endif
+       veor    d25,d24
+       vbsl    d29,d16,d17             @ Ch(e,f,g)
+       vshr.u64        d24,d19,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d18
+       vshr.u64        d25,d19,#34
+       vsli.64 d24,d19,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d19,#39
+       vadd.i64        d28,d13
+       vsli.64 d25,d19,#30
+       veor    d30,d19,d20
+       vsli.64 d26,d19,#25
+       veor    d18,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d21,d20             @ Maj(a,b,c)
+       veor    d18,d26                 @ Sigma0(a)
+       vadd.i64        d22,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d18,d30
+       vshr.u64        d24,d22,#14     @ 14
+#if 14<16
+       vld1.64 {d14},[r1]!     @ handles unaligned
+#endif
+       vshr.u64        d25,d22,#18
+#if 14>0
+       vadd.i64        d18,d30                 @ h+=Maj from the past
+#endif
+       vshr.u64        d26,d22,#41
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d22,#50
+       vsli.64 d25,d22,#46
+       vmov    d29,d22
+       vsli.64 d26,d22,#23
+#if 14<16 && defined(__ARMEL__)
+       vrev64.8        d14,d14
+#endif
+       veor    d25,d24
+       vbsl    d29,d23,d16             @ Ch(e,f,g)
+       vshr.u64        d24,d18,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d17
+       vshr.u64        d25,d18,#34
+       vsli.64 d24,d18,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d18,#39
+       vadd.i64        d28,d14
+       vsli.64 d25,d18,#30
+       veor    d30,d18,d19
+       vsli.64 d26,d18,#25
+       veor    d17,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d20,d19             @ Maj(a,b,c)
+       veor    d17,d26                 @ Sigma0(a)
+       vadd.i64        d21,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d17,d30
+       vshr.u64        d24,d21,#14     @ 15
+#if 15<16
+       vld1.64 {d15},[r1]!     @ handles unaligned
+#endif
+       vshr.u64        d25,d21,#18
+#if 15>0
+       vadd.i64        d17,d30                 @ h+=Maj from the past
+#endif
+       vshr.u64        d26,d21,#41
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d21,#50
+       vsli.64 d25,d21,#46
+       vmov    d29,d21
+       vsli.64 d26,d21,#23
+#if 15<16 && defined(__ARMEL__)
+       vrev64.8        d15,d15
+#endif
+       veor    d25,d24
+       vbsl    d29,d22,d23             @ Ch(e,f,g)
+       vshr.u64        d24,d17,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d16
+       vshr.u64        d25,d17,#34
+       vsli.64 d24,d17,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d17,#39
+       vadd.i64        d28,d15
+       vsli.64 d25,d17,#30
+       veor    d30,d17,d18
+       vsli.64 d26,d17,#25
+       veor    d16,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d19,d18             @ Maj(a,b,c)
+       veor    d16,d26                 @ Sigma0(a)
+       vadd.i64        d20,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d16,d30
+       mov     r12,#4
+.L16_79_neon:
+       subs    r12,#1
+       vshr.u64        q12,q7,#19
+       vshr.u64        q13,q7,#61
+       vadd.i64        d16,d30                 @ h+=Maj from the past
+       vshr.u64        q15,q7,#6
+       vsli.64 q12,q7,#45
+       vext.8  q14,q0,q1,#8    @ X[i+1]
+       vsli.64 q13,q7,#3
+       veor    q15,q12
+       vshr.u64        q12,q14,#1
+       veor    q15,q13                         @ sigma1(X[i+14])
+       vshr.u64        q13,q14,#8
+       vadd.i64        q0,q15
+       vshr.u64        q15,q14,#7
+       vsli.64 q12,q14,#63
+       vsli.64 q13,q14,#56
+       vext.8  q14,q4,q5,#8    @ X[i+9]
+       veor    q15,q12
+       vshr.u64        d24,d20,#14             @ from NEON_00_15
+       vadd.i64        q0,q14
+       vshr.u64        d25,d20,#18             @ from NEON_00_15
+       veor    q15,q13                         @ sigma0(X[i+1])
+       vshr.u64        d26,d20,#41             @ from NEON_00_15
+       vadd.i64        q0,q15
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d20,#50
+       vsli.64 d25,d20,#46
+       vmov    d29,d20
+       vsli.64 d26,d20,#23
+#if 16<16 && defined(__ARMEL__)
+       vrev64.8        ,
+#endif
+       veor    d25,d24
+       vbsl    d29,d21,d22             @ Ch(e,f,g)
+       vshr.u64        d24,d16,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d23
+       vshr.u64        d25,d16,#34
+       vsli.64 d24,d16,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d16,#39
+       vadd.i64        d28,d0
+       vsli.64 d25,d16,#30
+       veor    d30,d16,d17
+       vsli.64 d26,d16,#25
+       veor    d23,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d18,d17             @ Maj(a,b,c)
+       veor    d23,d26                 @ Sigma0(a)
+       vadd.i64        d19,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d23,d30
+       vshr.u64        d24,d19,#14     @ 17
+#if 17<16
+       vld1.64 {d1},[r1]!      @ handles unaligned
+#endif
+       vshr.u64        d25,d19,#18
+#if 17>0
+       vadd.i64        d23,d30                 @ h+=Maj from the past
+#endif
+       vshr.u64        d26,d19,#41
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d19,#50
+       vsli.64 d25,d19,#46
+       vmov    d29,d19
+       vsli.64 d26,d19,#23
+#if 17<16 && defined(__ARMEL__)
+       vrev64.8        ,
+#endif
+       veor    d25,d24
+       vbsl    d29,d20,d21             @ Ch(e,f,g)
+       vshr.u64        d24,d23,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d22
+       vshr.u64        d25,d23,#34
+       vsli.64 d24,d23,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d23,#39
+       vadd.i64        d28,d1
+       vsli.64 d25,d23,#30
+       veor    d30,d23,d16
+       vsli.64 d26,d23,#25
+       veor    d22,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d17,d16             @ Maj(a,b,c)
+       veor    d22,d26                 @ Sigma0(a)
+       vadd.i64        d18,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d22,d30
+       vshr.u64        q12,q0,#19
+       vshr.u64        q13,q0,#61
+       vadd.i64        d22,d30                 @ h+=Maj from the past
+       vshr.u64        q15,q0,#6
+       vsli.64 q12,q0,#45
+       vext.8  q14,q1,q2,#8    @ X[i+1]
+       vsli.64 q13,q0,#3
+       veor    q15,q12
+       vshr.u64        q12,q14,#1
+       veor    q15,q13                         @ sigma1(X[i+14])
+       vshr.u64        q13,q14,#8
+       vadd.i64        q1,q15
+       vshr.u64        q15,q14,#7
+       vsli.64 q12,q14,#63
+       vsli.64 q13,q14,#56
+       vext.8  q14,q5,q6,#8    @ X[i+9]
+       veor    q15,q12
+       vshr.u64        d24,d18,#14             @ from NEON_00_15
+       vadd.i64        q1,q14
+       vshr.u64        d25,d18,#18             @ from NEON_00_15
+       veor    q15,q13                         @ sigma0(X[i+1])
+       vshr.u64        d26,d18,#41             @ from NEON_00_15
+       vadd.i64        q1,q15
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d18,#50
+       vsli.64 d25,d18,#46
+       vmov    d29,d18
+       vsli.64 d26,d18,#23
+#if 18<16 && defined(__ARMEL__)
+       vrev64.8        ,
+#endif
+       veor    d25,d24
+       vbsl    d29,d19,d20             @ Ch(e,f,g)
+       vshr.u64        d24,d22,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d21
+       vshr.u64        d25,d22,#34
+       vsli.64 d24,d22,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d22,#39
+       vadd.i64        d28,d2
+       vsli.64 d25,d22,#30
+       veor    d30,d22,d23
+       vsli.64 d26,d22,#25
+       veor    d21,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d16,d23             @ Maj(a,b,c)
+       veor    d21,d26                 @ Sigma0(a)
+       vadd.i64        d17,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d21,d30
+       vshr.u64        d24,d17,#14     @ 19
+#if 19<16
+       vld1.64 {d3},[r1]!      @ handles unaligned
+#endif
+       vshr.u64        d25,d17,#18
+#if 19>0
+       vadd.i64        d21,d30                 @ h+=Maj from the past
+#endif
+       vshr.u64        d26,d17,#41
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d17,#50
+       vsli.64 d25,d17,#46
+       vmov    d29,d17
+       vsli.64 d26,d17,#23
+#if 19<16 && defined(__ARMEL__)
+       vrev64.8        ,
+#endif
+       veor    d25,d24
+       vbsl    d29,d18,d19             @ Ch(e,f,g)
+       vshr.u64        d24,d21,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d20
+       vshr.u64        d25,d21,#34
+       vsli.64 d24,d21,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d21,#39
+       vadd.i64        d28,d3
+       vsli.64 d25,d21,#30
+       veor    d30,d21,d22
+       vsli.64 d26,d21,#25
+       veor    d20,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d23,d22             @ Maj(a,b,c)
+       veor    d20,d26                 @ Sigma0(a)
+       vadd.i64        d16,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d20,d30
+       vshr.u64        q12,q1,#19
+       vshr.u64        q13,q1,#61
+       vadd.i64        d20,d30                 @ h+=Maj from the past
+       vshr.u64        q15,q1,#6
+       vsli.64 q12,q1,#45
+       vext.8  q14,q2,q3,#8    @ X[i+1]
+       vsli.64 q13,q1,#3
+       veor    q15,q12
+       vshr.u64        q12,q14,#1
+       veor    q15,q13                         @ sigma1(X[i+14])
+       vshr.u64        q13,q14,#8
+       vadd.i64        q2,q15
+       vshr.u64        q15,q14,#7
+       vsli.64 q12,q14,#63
+       vsli.64 q13,q14,#56
+       vext.8  q14,q6,q7,#8    @ X[i+9]
+       veor    q15,q12
+       vshr.u64        d24,d16,#14             @ from NEON_00_15
+       vadd.i64        q2,q14
+       vshr.u64        d25,d16,#18             @ from NEON_00_15
+       veor    q15,q13                         @ sigma0(X[i+1])
+       vshr.u64        d26,d16,#41             @ from NEON_00_15
+       vadd.i64        q2,q15
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d16,#50
+       vsli.64 d25,d16,#46
+       vmov    d29,d16
+       vsli.64 d26,d16,#23
+#if 20<16 && defined(__ARMEL__)
+       vrev64.8        ,
+#endif
+       veor    d25,d24
+       vbsl    d29,d17,d18             @ Ch(e,f,g)
+       vshr.u64        d24,d20,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d19
+       vshr.u64        d25,d20,#34
+       vsli.64 d24,d20,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d20,#39
+       vadd.i64        d28,d4
+       vsli.64 d25,d20,#30
+       veor    d30,d20,d21
+       vsli.64 d26,d20,#25
+       veor    d19,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d22,d21             @ Maj(a,b,c)
+       veor    d19,d26                 @ Sigma0(a)
+       vadd.i64        d23,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d19,d30
+       vshr.u64        d24,d23,#14     @ 21
+#if 21<16
+       vld1.64 {d5},[r1]!      @ handles unaligned
+#endif
+       vshr.u64        d25,d23,#18
+#if 21>0
+       vadd.i64        d19,d30                 @ h+=Maj from the past
+#endif
+       vshr.u64        d26,d23,#41
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d23,#50
+       vsli.64 d25,d23,#46
+       vmov    d29,d23
+       vsli.64 d26,d23,#23
+#if 21<16 && defined(__ARMEL__)
+       vrev64.8        ,
+#endif
+       veor    d25,d24
+       vbsl    d29,d16,d17             @ Ch(e,f,g)
+       vshr.u64        d24,d19,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d18
+       vshr.u64        d25,d19,#34
+       vsli.64 d24,d19,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d19,#39
+       vadd.i64        d28,d5
+       vsli.64 d25,d19,#30
+       veor    d30,d19,d20
+       vsli.64 d26,d19,#25
+       veor    d18,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d21,d20             @ Maj(a,b,c)
+       veor    d18,d26                 @ Sigma0(a)
+       vadd.i64        d22,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d18,d30
+       vshr.u64        q12,q2,#19
+       vshr.u64        q13,q2,#61
+       vadd.i64        d18,d30                 @ h+=Maj from the past
+       vshr.u64        q15,q2,#6
+       vsli.64 q12,q2,#45
+       vext.8  q14,q3,q4,#8    @ X[i+1]
+       vsli.64 q13,q2,#3
+       veor    q15,q12
+       vshr.u64        q12,q14,#1
+       veor    q15,q13                         @ sigma1(X[i+14])
+       vshr.u64        q13,q14,#8
+       vadd.i64        q3,q15
+       vshr.u64        q15,q14,#7
+       vsli.64 q12,q14,#63
+       vsli.64 q13,q14,#56
+       vext.8  q14,q7,q0,#8    @ X[i+9]
+       veor    q15,q12
+       vshr.u64        d24,d22,#14             @ from NEON_00_15
+       vadd.i64        q3,q14
+       vshr.u64        d25,d22,#18             @ from NEON_00_15
+       veor    q15,q13                         @ sigma0(X[i+1])
+       vshr.u64        d26,d22,#41             @ from NEON_00_15
+       vadd.i64        q3,q15
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d22,#50
+       vsli.64 d25,d22,#46
+       vmov    d29,d22
+       vsli.64 d26,d22,#23
+#if 22<16 && defined(__ARMEL__)
+       vrev64.8        ,
+#endif
+       veor    d25,d24
+       vbsl    d29,d23,d16             @ Ch(e,f,g)
+       vshr.u64        d24,d18,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d17
+       vshr.u64        d25,d18,#34
+       vsli.64 d24,d18,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d18,#39
+       vadd.i64        d28,d6
+       vsli.64 d25,d18,#30
+       veor    d30,d18,d19
+       vsli.64 d26,d18,#25
+       veor    d17,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d20,d19             @ Maj(a,b,c)
+       veor    d17,d26                 @ Sigma0(a)
+       vadd.i64        d21,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d17,d30
+       vshr.u64        d24,d21,#14     @ 23
+#if 23<16
+       vld1.64 {d7},[r1]!      @ handles unaligned
+#endif
+       vshr.u64        d25,d21,#18
+#if 23>0
+       vadd.i64        d17,d30                 @ h+=Maj from the past
+#endif
+       vshr.u64        d26,d21,#41
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d21,#50
+       vsli.64 d25,d21,#46
+       vmov    d29,d21
+       vsli.64 d26,d21,#23
+#if 23<16 && defined(__ARMEL__)
+       vrev64.8        ,
+#endif
+       veor    d25,d24
+       vbsl    d29,d22,d23             @ Ch(e,f,g)
+       vshr.u64        d24,d17,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d16
+       vshr.u64        d25,d17,#34
+       vsli.64 d24,d17,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d17,#39
+       vadd.i64        d28,d7
+       vsli.64 d25,d17,#30
+       veor    d30,d17,d18
+       vsli.64 d26,d17,#25
+       veor    d16,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d19,d18             @ Maj(a,b,c)
+       veor    d16,d26                 @ Sigma0(a)
+       vadd.i64        d20,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d16,d30
+       vshr.u64        q12,q3,#19
+       vshr.u64        q13,q3,#61
+       vadd.i64        d16,d30                 @ h+=Maj from the past
+       vshr.u64        q15,q3,#6
+       vsli.64 q12,q3,#45
+       vext.8  q14,q4,q5,#8    @ X[i+1]
+       vsli.64 q13,q3,#3
+       veor    q15,q12
+       vshr.u64        q12,q14,#1
+       veor    q15,q13                         @ sigma1(X[i+14])
+       vshr.u64        q13,q14,#8
+       vadd.i64        q4,q15
+       vshr.u64        q15,q14,#7
+       vsli.64 q12,q14,#63
+       vsli.64 q13,q14,#56
+       vext.8  q14,q0,q1,#8    @ X[i+9]
+       veor    q15,q12
+       vshr.u64        d24,d20,#14             @ from NEON_00_15
+       vadd.i64        q4,q14
+       vshr.u64        d25,d20,#18             @ from NEON_00_15
+       veor    q15,q13                         @ sigma0(X[i+1])
+       vshr.u64        d26,d20,#41             @ from NEON_00_15
+       vadd.i64        q4,q15
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d20,#50
+       vsli.64 d25,d20,#46
+       vmov    d29,d20
+       vsli.64 d26,d20,#23
+#if 24<16 && defined(__ARMEL__)
+       vrev64.8        ,
+#endif
+       veor    d25,d24
+       vbsl    d29,d21,d22             @ Ch(e,f,g)
+       vshr.u64        d24,d16,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d23
+       vshr.u64        d25,d16,#34
+       vsli.64 d24,d16,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d16,#39
+       vadd.i64        d28,d8
+       vsli.64 d25,d16,#30
+       veor    d30,d16,d17
+       vsli.64 d26,d16,#25
+       veor    d23,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d18,d17             @ Maj(a,b,c)
+       veor    d23,d26                 @ Sigma0(a)
+       vadd.i64        d19,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d23,d30
+       vshr.u64        d24,d19,#14     @ 25
+#if 25<16
+       vld1.64 {d9},[r1]!      @ handles unaligned
+#endif
+       vshr.u64        d25,d19,#18
+#if 25>0
+       vadd.i64        d23,d30                 @ h+=Maj from the past
+#endif
+       vshr.u64        d26,d19,#41
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d19,#50
+       vsli.64 d25,d19,#46
+       vmov    d29,d19
+       vsli.64 d26,d19,#23
+#if 25<16 && defined(__ARMEL__)
+       vrev64.8        ,
+#endif
+       veor    d25,d24
+       vbsl    d29,d20,d21             @ Ch(e,f,g)
+       vshr.u64        d24,d23,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d22
+       vshr.u64        d25,d23,#34
+       vsli.64 d24,d23,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d23,#39
+       vadd.i64        d28,d9
+       vsli.64 d25,d23,#30
+       veor    d30,d23,d16
+       vsli.64 d26,d23,#25
+       veor    d22,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d17,d16             @ Maj(a,b,c)
+       veor    d22,d26                 @ Sigma0(a)
+       vadd.i64        d18,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d22,d30
+       vshr.u64        q12,q4,#19
+       vshr.u64        q13,q4,#61
+       vadd.i64        d22,d30                 @ h+=Maj from the past
+       vshr.u64        q15,q4,#6
+       vsli.64 q12,q4,#45
+       vext.8  q14,q5,q6,#8    @ X[i+1]
+       vsli.64 q13,q4,#3
+       veor    q15,q12
+       vshr.u64        q12,q14,#1
+       veor    q15,q13                         @ sigma1(X[i+14])
+       vshr.u64        q13,q14,#8
+       vadd.i64        q5,q15
+       vshr.u64        q15,q14,#7
+       vsli.64 q12,q14,#63
+       vsli.64 q13,q14,#56
+       vext.8  q14,q1,q2,#8    @ X[i+9]
+       veor    q15,q12
+       vshr.u64        d24,d18,#14             @ from NEON_00_15
+       vadd.i64        q5,q14
+       vshr.u64        d25,d18,#18             @ from NEON_00_15
+       veor    q15,q13                         @ sigma0(X[i+1])
+       vshr.u64        d26,d18,#41             @ from NEON_00_15
+       vadd.i64        q5,q15
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d18,#50
+       vsli.64 d25,d18,#46
+       vmov    d29,d18
+       vsli.64 d26,d18,#23
+#if 26<16 && defined(__ARMEL__)
+       vrev64.8        ,
+#endif
+       veor    d25,d24
+       vbsl    d29,d19,d20             @ Ch(e,f,g)
+       vshr.u64        d24,d22,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d21
+       vshr.u64        d25,d22,#34
+       vsli.64 d24,d22,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d22,#39
+       vadd.i64        d28,d10
+       vsli.64 d25,d22,#30
+       veor    d30,d22,d23
+       vsli.64 d26,d22,#25
+       veor    d21,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d16,d23             @ Maj(a,b,c)
+       veor    d21,d26                 @ Sigma0(a)
+       vadd.i64        d17,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d21,d30
+       vshr.u64        d24,d17,#14     @ 27
+#if 27<16
+       vld1.64 {d11},[r1]!     @ handles unaligned
+#endif
+       vshr.u64        d25,d17,#18
+#if 27>0
+       vadd.i64        d21,d30                 @ h+=Maj from the past
+#endif
+       vshr.u64        d26,d17,#41
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d17,#50
+       vsli.64 d25,d17,#46
+       vmov    d29,d17
+       vsli.64 d26,d17,#23
+#if 27<16 && defined(__ARMEL__)
+       vrev64.8        ,
+#endif
+       veor    d25,d24
+       vbsl    d29,d18,d19             @ Ch(e,f,g)
+       vshr.u64        d24,d21,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d20
+       vshr.u64        d25,d21,#34
+       vsli.64 d24,d21,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d21,#39
+       vadd.i64        d28,d11
+       vsli.64 d25,d21,#30
+       veor    d30,d21,d22
+       vsli.64 d26,d21,#25
+       veor    d20,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d23,d22             @ Maj(a,b,c)
+       veor    d20,d26                 @ Sigma0(a)
+       vadd.i64        d16,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d20,d30
+       vshr.u64        q12,q5,#19
+       vshr.u64        q13,q5,#61
+       vadd.i64        d20,d30                 @ h+=Maj from the past
+       vshr.u64        q15,q5,#6
+       vsli.64 q12,q5,#45
+       vext.8  q14,q6,q7,#8    @ X[i+1]
+       vsli.64 q13,q5,#3
+       veor    q15,q12
+       vshr.u64        q12,q14,#1
+       veor    q15,q13                         @ sigma1(X[i+14])
+       vshr.u64        q13,q14,#8
+       vadd.i64        q6,q15
+       vshr.u64        q15,q14,#7
+       vsli.64 q12,q14,#63
+       vsli.64 q13,q14,#56
+       vext.8  q14,q2,q3,#8    @ X[i+9]
+       veor    q15,q12
+       vshr.u64        d24,d16,#14             @ from NEON_00_15
+       vadd.i64        q6,q14
+       vshr.u64        d25,d16,#18             @ from NEON_00_15
+       veor    q15,q13                         @ sigma0(X[i+1])
+       vshr.u64        d26,d16,#41             @ from NEON_00_15
+       vadd.i64        q6,q15
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d16,#50
+       vsli.64 d25,d16,#46
+       vmov    d29,d16
+       vsli.64 d26,d16,#23
+#if 28<16 && defined(__ARMEL__)
+       vrev64.8        ,
+#endif
+       veor    d25,d24
+       vbsl    d29,d17,d18             @ Ch(e,f,g)
+       vshr.u64        d24,d20,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d19
+       vshr.u64        d25,d20,#34
+       vsli.64 d24,d20,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d20,#39
+       vadd.i64        d28,d12
+       vsli.64 d25,d20,#30
+       veor    d30,d20,d21
+       vsli.64 d26,d20,#25
+       veor    d19,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d22,d21             @ Maj(a,b,c)
+       veor    d19,d26                 @ Sigma0(a)
+       vadd.i64        d23,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d19,d30
+       vshr.u64        d24,d23,#14     @ 29
+#if 29<16
+       vld1.64 {d13},[r1]!     @ handles unaligned
+#endif
+       vshr.u64        d25,d23,#18
+#if 29>0
+       vadd.i64        d19,d30                 @ h+=Maj from the past
+#endif
+       vshr.u64        d26,d23,#41
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d23,#50
+       vsli.64 d25,d23,#46
+       vmov    d29,d23
+       vsli.64 d26,d23,#23
+#if 29<16 && defined(__ARMEL__)
+       vrev64.8        ,
+#endif
+       veor    d25,d24
+       vbsl    d29,d16,d17             @ Ch(e,f,g)
+       vshr.u64        d24,d19,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d18
+       vshr.u64        d25,d19,#34
+       vsli.64 d24,d19,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d19,#39
+       vadd.i64        d28,d13
+       vsli.64 d25,d19,#30
+       veor    d30,d19,d20
+       vsli.64 d26,d19,#25
+       veor    d18,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d21,d20             @ Maj(a,b,c)
+       veor    d18,d26                 @ Sigma0(a)
+       vadd.i64        d22,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d18,d30
+       vshr.u64        q12,q6,#19
+       vshr.u64        q13,q6,#61
+       vadd.i64        d18,d30                 @ h+=Maj from the past
+       vshr.u64        q15,q6,#6
+       vsli.64 q12,q6,#45
+       vext.8  q14,q7,q0,#8    @ X[i+1]
+       vsli.64 q13,q6,#3
+       veor    q15,q12
+       vshr.u64        q12,q14,#1
+       veor    q15,q13                         @ sigma1(X[i+14])
+       vshr.u64        q13,q14,#8
+       vadd.i64        q7,q15
+       vshr.u64        q15,q14,#7
+       vsli.64 q12,q14,#63
+       vsli.64 q13,q14,#56
+       vext.8  q14,q3,q4,#8    @ X[i+9]
+       veor    q15,q12
+       vshr.u64        d24,d22,#14             @ from NEON_00_15
+       vadd.i64        q7,q14
+       vshr.u64        d25,d22,#18             @ from NEON_00_15
+       veor    q15,q13                         @ sigma0(X[i+1])
+       vshr.u64        d26,d22,#41             @ from NEON_00_15
+       vadd.i64        q7,q15
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d22,#50
+       vsli.64 d25,d22,#46
+       vmov    d29,d22
+       vsli.64 d26,d22,#23
+#if 30<16 && defined(__ARMEL__)
+       vrev64.8        ,
+#endif
+       veor    d25,d24
+       vbsl    d29,d23,d16             @ Ch(e,f,g)
+       vshr.u64        d24,d18,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d17
+       vshr.u64        d25,d18,#34
+       vsli.64 d24,d18,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d18,#39
+       vadd.i64        d28,d14
+       vsli.64 d25,d18,#30
+       veor    d30,d18,d19
+       vsli.64 d26,d18,#25
+       veor    d17,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d20,d19             @ Maj(a,b,c)
+       veor    d17,d26                 @ Sigma0(a)
+       vadd.i64        d21,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d17,d30
+       vshr.u64        d24,d21,#14     @ 31
+#if 31<16
+       vld1.64 {d15},[r1]!     @ handles unaligned
+#endif
+       vshr.u64        d25,d21,#18
+#if 31>0
+       vadd.i64        d17,d30                 @ h+=Maj from the past
+#endif
+       vshr.u64        d26,d21,#41
+       vld1.64 {d28},[r3,:64]! @ K[i++]
+       vsli.64 d24,d21,#50
+       vsli.64 d25,d21,#46
+       vmov    d29,d21
+       vsli.64 d26,d21,#23
+#if 31<16 && defined(__ARMEL__)
+       vrev64.8        ,
+#endif
+       veor    d25,d24
+       vbsl    d29,d22,d23             @ Ch(e,f,g)
+       vshr.u64        d24,d17,#28
+       veor    d26,d25                 @ Sigma1(e)
+       vadd.i64        d27,d29,d16
+       vshr.u64        d25,d17,#34
+       vsli.64 d24,d17,#36
+       vadd.i64        d27,d26
+       vshr.u64        d26,d17,#39
+       vadd.i64        d28,d15
+       vsli.64 d25,d17,#30
+       veor    d30,d17,d18
+       vsli.64 d26,d17,#25
+       veor    d16,d24,d25
+       vadd.i64        d27,d28
+       vbsl    d30,d19,d18             @ Maj(a,b,c)
+       veor    d16,d26                 @ Sigma0(a)
+       vadd.i64        d20,d27
+       vadd.i64        d30,d27
+       @ vadd.i64      d16,d30
+       bne     .L16_79_neon
+
+       vadd.i64        d16,d30         @ h+=Maj from the past
+       vldmia  r0,{d24,d25,d26,d27,d28,d29,d30,d31}    @ load context to temp
+       vadd.i64        q8,q12          @ vectorized accumulate
+       vadd.i64        q9,q13
+       vadd.i64        q10,q14
+       vadd.i64        q11,q15
+       vstmia  r0,{d16,d17,d18,d19,d20,d21,d22,d23}    @ save context
+       teq     r1,r2
+       sub     r3,#640 @ rewind K512
+       bne     .Loop_neon
+
+       VFP_ABI_POP
+       bx      lr                              @ .word 0xe12fff1e
+.size  zfs_sha512_block_neon,.-zfs_sha512_block_neon
+#endif
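
The NEON rounds above annotate their register shuffling with "@ Ch(e,f,g)", "@ Maj(a,b,c)", "@ Sigma0(a)/Sigma1(e)" and "@ sigma0/sigma1(X[...])"; each rotate is emitted as a vshr #n plus a vsli #(64-n) pair. As an aid to reading those comments, the plain-C sketch below restates the SHA-512 round primitives with the same rotation constants. It is an editor's illustration only, not code from this patch; the commit's portable C implementation lives in module/icp/algs/sha2/sha2_generic.c and may express these differently.

    #include <stdint.h>

    /*
     * SHA-512 round primitives (FIPS 180-4 notation) referenced by the
     * NEON comments above.  Illustrative sketch only.
     */
    static inline uint64_t rotr64(uint64_t x, unsigned n)
    {
    	/* in the NEON code: vshr.u64 #n followed by vsli.64 #(64-n) */
    	return ((x >> n) | (x << (64 - n)));
    }

    static inline uint64_t Ch(uint64_t e, uint64_t f, uint64_t g)
    {
    	/* the assembly uses vbsl with e as the bitwise select mask */
    	return ((e & f) ^ (~e & g));
    }

    static inline uint64_t Maj(uint64_t a, uint64_t b, uint64_t c)
    {
    	/* the veor/vbsl pair uses an equivalent bitwise-select form */
    	return ((a & b) ^ (a & c) ^ (b & c));
    }

    static inline uint64_t Sigma0(uint64_t a)
    {
    	return (rotr64(a, 28) ^ rotr64(a, 34) ^ rotr64(a, 39));
    }

    static inline uint64_t Sigma1(uint64_t e)
    {
    	return (rotr64(e, 14) ^ rotr64(e, 18) ^ rotr64(e, 41));
    }

    static inline uint64_t sigma0(uint64_t x)	/* message schedule, X[i+1] */
    {
    	return (rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7));
    }

    static inline uint64_t sigma1(uint64_t x)	/* message schedule, X[i+14] */
    {
    	return (rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6));
    }
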
diff --git a/module/icp/asm-ppc64/sha2/sha256-p8.S b/module/icp/asm-ppc64/sha2/sha256-p8.S
new file mode 100644 (file)
index 0000000..6bbfe23
--- /dev/null
@@ -0,0 +1,1505 @@
+/*
+ * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Portions Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ * - modified assembly to fit into OpenZFS
+ */
+
+#if (defined(__PPC64__) && defined(__BIG_ENDIAN__))
+
+.text
+
+.globl zfs_sha256_power8
+.globl .zfs_sha256_power8
+.type  zfs_sha256_power8,@function
+.section       ".opd","aw"
+.align 3
+zfs_sha256_power8:
+.quad  .zfs_sha256_power8,.TOC.@tocbase,0
+.previous
+.align 6
+.zfs_sha256_power8:
+       stdu    1,-384(1)
+       mflr    8
+       li      10,207
+       li      11,223
+       stvx    24,10,1
+       addi    10,10,32
+       mfspr   12,256
+       stvx    25,11,1
+       addi    11,11,32
+       stvx    26,10,1
+       addi    10,10,32
+       stvx    27,11,1
+       addi    11,11,32
+       stvx    28,10,1
+       addi    10,10,32
+       stvx    29,11,1
+       addi    11,11,32
+       stvx    30,10,1
+       stvx    31,11,1
+       li      11,-4096+255
+       stw     12,332(1)
+       li      10,0x10
+       std     26,336(1)
+       li      26,0x20
+       std     27,344(1)
+       li      27,0x30
+       std     28,352(1)
+       li      28,0x40
+       std     29,360(1)
+       li      29,0x50
+       std     30,368(1)
+       li      30,0x60
+       std     31,376(1)
+       li      31,0x70
+       std     8,400(1)
+       mtspr   256,11
+
+       bl      .LPICmeup
+       addi    11,1,79
+       .long   0x7C001E19
+       .long   0x7C8A1E19
+       vsldoi  1,0,0,4
+       vsldoi  2,0,0,8
+       vsldoi  3,0,0,12
+       vsldoi  5,4,4,4
+       vsldoi  6,4,4,8
+       vsldoi  7,4,4,12
+       li      0,3
+       b       .Loop
+.align 5
+.Loop:
+       lvx     28,0,6
+       .long   0x7D002699
+       addi    4,4,16
+       mr      7,6
+       stvx    0,0,11
+       stvx    1,10,11
+       stvx    2,26,11
+       stvx    3,27,11
+       stvx    4,28,11
+       stvx    5,29,11
+       stvx    6,30,11
+       stvx    7,31,11
+       vadduwm 7,7,28
+       lvx     28,10,6
+       vadduwm 7,7,8
+       vsel    29,6,5,4
+       vadduwm 6,6,28
+       vadduwm 7,7,29
+       .long   0x13C4FE82
+       vadduwm 7,7,30
+       vxor    29,0,1
+       vsel    29,1,2,29
+       vadduwm 3,3,7
+       .long   0x13C08682
+       vadduwm 30,30,29
+       vadduwm 7,7,30
+       lvx     28,26,7
+       vsldoi  9,8,8,4
+       vadduwm 6,6,9
+       vsel    29,5,4,3
+       vadduwm 5,5,28
+       vadduwm 6,6,29
+       .long   0x13C3FE82
+       vadduwm 6,6,30
+       vxor    29,7,0
+       vsel    29,0,1,29
+       vadduwm 2,2,6
+       .long   0x13C78682
+       vadduwm 30,30,29
+       vadduwm 6,6,30
+       lvx     28,27,7
+       vsldoi  10,9,9,4
+       vadduwm 5,5,10
+       vsel    29,4,3,2
+       vadduwm 4,4,28
+       vadduwm 5,5,29
+       .long   0x13C2FE82
+       vadduwm 5,5,30
+       vxor    29,6,7
+       vsel    29,7,0,29
+       vadduwm 1,1,5
+       .long   0x13C68682
+       vadduwm 30,30,29
+       vadduwm 5,5,30
+       lvx     28,28,7
+       .long   0x7D802699
+       addi    4,4,16
+       vsldoi  11,10,10,4
+       vadduwm 4,4,11
+       vsel    29,3,2,1
+       vadduwm 3,3,28
+       vadduwm 4,4,29
+       .long   0x13C1FE82
+       vadduwm 4,4,30
+       vxor    29,5,6
+       vsel    29,6,7,29
+       vadduwm 0,0,4
+       .long   0x13C58682
+       vadduwm 30,30,29
+       vadduwm 4,4,30
+       lvx     28,29,7
+       vadduwm 3,3,12
+       vsel    29,2,1,0
+       vadduwm 2,2,28
+       vadduwm 3,3,29
+       .long   0x13C0FE82
+       vadduwm 3,3,30
+       vxor    29,4,5
+       vsel    29,5,6,29
+       vadduwm 7,7,3
+       .long   0x13C48682
+       vadduwm 30,30,29
+       vadduwm 3,3,30
+       lvx     28,30,7
+       vsldoi  13,12,12,4
+       vadduwm 2,2,13
+       vsel    29,1,0,7
+       vadduwm 1,1,28
+       vadduwm 2,2,29
+       .long   0x13C7FE82
+       vadduwm 2,2,30
+       vxor    29,3,4
+       vsel    29,4,5,29
+       vadduwm 6,6,2
+       .long   0x13C38682
+       vadduwm 30,30,29
+       vadduwm 2,2,30
+       lvx     28,31,7
+       addi    7,7,0x80
+       vsldoi  14,13,13,4
+       vadduwm 1,1,14
+       vsel    29,0,7,6
+       vadduwm 0,0,28
+       vadduwm 1,1,29
+       .long   0x13C6FE82
+       vadduwm 1,1,30
+       vxor    29,2,3
+       vsel    29,3,4,29
+       vadduwm 5,5,1
+       .long   0x13C28682
+       vadduwm 30,30,29
+       vadduwm 1,1,30
+       lvx     28,0,7
+       .long   0x7E002699
+       addi    4,4,16
+       vsldoi  15,14,14,4
+       vadduwm 0,0,15
+       vsel    29,7,6,5
+       vadduwm 7,7,28
+       vadduwm 0,0,29
+       .long   0x13C5FE82
+       vadduwm 0,0,30
+       vxor    29,1,2
+       vsel    29,2,3,29
+       vadduwm 4,4,0
+       .long   0x13C18682
+       vadduwm 30,30,29
+       vadduwm 0,0,30
+       lvx     28,10,7
+       vadduwm 7,7,16
+       vsel    29,6,5,4
+       vadduwm 6,6,28
+       vadduwm 7,7,29
+       .long   0x13C4FE82
+       vadduwm 7,7,30
+       vxor    29,0,1
+       vsel    29,1,2,29
+       vadduwm 3,3,7
+       .long   0x13C08682
+       vadduwm 30,30,29
+       vadduwm 7,7,30
+       lvx     28,26,7
+       vsldoi  17,16,16,4
+       vadduwm 6,6,17
+       vsel    29,5,4,3
+       vadduwm 5,5,28
+       vadduwm 6,6,29
+       .long   0x13C3FE82
+       vadduwm 6,6,30
+       vxor    29,7,0
+       vsel    29,0,1,29
+       vadduwm 2,2,6
+       .long   0x13C78682
+       vadduwm 30,30,29
+       vadduwm 6,6,30
+       lvx     28,27,7
+       vsldoi  18,17,17,4
+       vadduwm 5,5,18
+       vsel    29,4,3,2
+       vadduwm 4,4,28
+       vadduwm 5,5,29
+       .long   0x13C2FE82
+       vadduwm 5,5,30
+       vxor    29,6,7
+       vsel    29,7,0,29
+       vadduwm 1,1,5
+       .long   0x13C68682
+       vadduwm 30,30,29
+       vadduwm 5,5,30
+       lvx     28,28,7
+       .long   0x7F002699
+       addi    4,4,16
+       vsldoi  19,18,18,4
+       vadduwm 4,4,19
+       vsel    29,3,2,1
+       vadduwm 3,3,28
+       vadduwm 4,4,29
+       .long   0x13C1FE82
+       vadduwm 4,4,30
+       vxor    29,5,6
+       vsel    29,6,7,29
+       vadduwm 0,0,4
+       .long   0x13C58682
+       vadduwm 30,30,29
+       vadduwm 4,4,30
+       lvx     28,29,7
+       vadduwm 3,3,24
+       vsel    29,2,1,0
+       vadduwm 2,2,28
+       vadduwm 3,3,29
+       .long   0x13C0FE82
+       vadduwm 3,3,30
+       vxor    29,4,5
+       vsel    29,5,6,29
+       vadduwm 7,7,3
+       .long   0x13C48682
+       vadduwm 30,30,29
+       vadduwm 3,3,30
+       lvx     28,30,7
+       vsldoi  25,24,24,4
+       vadduwm 2,2,25
+       vsel    29,1,0,7
+       vadduwm 1,1,28
+       vadduwm 2,2,29
+       .long   0x13C7FE82
+       vadduwm 2,2,30
+       vxor    29,3,4
+       vsel    29,4,5,29
+       vadduwm 6,6,2
+       .long   0x13C38682
+       vadduwm 30,30,29
+       vadduwm 2,2,30
+       lvx     28,31,7
+       addi    7,7,0x80
+       vsldoi  26,25,25,4
+       vadduwm 1,1,26
+       vsel    29,0,7,6
+       vadduwm 0,0,28
+       vadduwm 1,1,29
+       .long   0x13C6FE82
+       vadduwm 1,1,30
+       vxor    29,2,3
+       vsel    29,3,4,29
+       vadduwm 5,5,1
+       .long   0x13C28682
+       vadduwm 30,30,29
+       vadduwm 1,1,30
+       lvx     28,0,7
+       vsldoi  27,26,26,4
+       .long   0x13C90682
+       vadduwm 8,8,30
+       .long   0x13DA7E82
+       vadduwm 8,8,30
+       vadduwm 8,8,17
+       vadduwm 0,0,27
+       vsel    29,7,6,5
+       vadduwm 7,7,28
+       vadduwm 0,0,29
+       .long   0x13C5FE82
+       vadduwm 0,0,30
+       vxor    29,1,2
+       vsel    29,2,3,29
+       vadduwm 4,4,0
+       .long   0x13C18682
+       vadduwm 30,30,29
+       vadduwm 0,0,30
+       lvx     28,10,7
+       mtctr   0
+       b       .L16_xx
+.align 5
+.L16_xx:
+       .long   0x13CA0682
+       vadduwm 9,9,30
+       .long   0x13DB7E82
+       vadduwm 9,9,30
+       vadduwm 9,9,18
+       vadduwm 7,7,8
+       vsel    29,6,5,4
+       vadduwm 6,6,28
+       vadduwm 7,7,29
+       .long   0x13C4FE82
+       vadduwm 7,7,30
+       vxor    29,0,1
+       vsel    29,1,2,29
+       vadduwm 3,3,7
+       .long   0x13C08682
+       vadduwm 30,30,29
+       vadduwm 7,7,30
+       lvx     28,26,7
+       .long   0x13CB0682
+       vadduwm 10,10,30
+       .long   0x13C87E82
+       vadduwm 10,10,30
+       vadduwm 10,10,19
+       vadduwm 6,6,9
+       vsel    29,5,4,3
+       vadduwm 5,5,28
+       vadduwm 6,6,29
+       .long   0x13C3FE82
+       vadduwm 6,6,30
+       vxor    29,7,0
+       vsel    29,0,1,29
+       vadduwm 2,2,6
+       .long   0x13C78682
+       vadduwm 30,30,29
+       vadduwm 6,6,30
+       lvx     28,27,7
+       .long   0x13CC0682
+       vadduwm 11,11,30
+       .long   0x13C97E82
+       vadduwm 11,11,30
+       vadduwm 11,11,24
+       vadduwm 5,5,10
+       vsel    29,4,3,2
+       vadduwm 4,4,28
+       vadduwm 5,5,29
+       .long   0x13C2FE82
+       vadduwm 5,5,30
+       vxor    29,6,7
+       vsel    29,7,0,29
+       vadduwm 1,1,5
+       .long   0x13C68682
+       vadduwm 30,30,29
+       vadduwm 5,5,30
+       lvx     28,28,7
+       .long   0x13CD0682
+       vadduwm 12,12,30
+       .long   0x13CA7E82
+       vadduwm 12,12,30
+       vadduwm 12,12,25
+       vadduwm 4,4,11
+       vsel    29,3,2,1
+       vadduwm 3,3,28
+       vadduwm 4,4,29
+       .long   0x13C1FE82
+       vadduwm 4,4,30
+       vxor    29,5,6
+       vsel    29,6,7,29
+       vadduwm 0,0,4
+       .long   0x13C58682
+       vadduwm 30,30,29
+       vadduwm 4,4,30
+       lvx     28,29,7
+       .long   0x13CE0682
+       vadduwm 13,13,30
+       .long   0x13CB7E82
+       vadduwm 13,13,30
+       vadduwm 13,13,26
+       vadduwm 3,3,12
+       vsel    29,2,1,0
+       vadduwm 2,2,28
+       vadduwm 3,3,29
+       .long   0x13C0FE82
+       vadduwm 3,3,30
+       vxor    29,4,5
+       vsel    29,5,6,29
+       vadduwm 7,7,3
+       .long   0x13C48682
+       vadduwm 30,30,29
+       vadduwm 3,3,30
+       lvx     28,30,7
+       .long   0x13CF0682
+       vadduwm 14,14,30
+       .long   0x13CC7E82
+       vadduwm 14,14,30
+       vadduwm 14,14,27
+       vadduwm 2,2,13
+       vsel    29,1,0,7
+       vadduwm 1,1,28
+       vadduwm 2,2,29
+       .long   0x13C7FE82
+       vadduwm 2,2,30
+       vxor    29,3,4
+       vsel    29,4,5,29
+       vadduwm 6,6,2
+       .long   0x13C38682
+       vadduwm 30,30,29
+       vadduwm 2,2,30
+       lvx     28,31,7
+       addi    7,7,0x80
+       .long   0x13D00682
+       vadduwm 15,15,30
+       .long   0x13CD7E82
+       vadduwm 15,15,30
+       vadduwm 15,15,8
+       vadduwm 1,1,14
+       vsel    29,0,7,6
+       vadduwm 0,0,28
+       vadduwm 1,1,29
+       .long   0x13C6FE82
+       vadduwm 1,1,30
+       vxor    29,2,3
+       vsel    29,3,4,29
+       vadduwm 5,5,1
+       .long   0x13C28682
+       vadduwm 30,30,29
+       vadduwm 1,1,30
+       lvx     28,0,7
+       .long   0x13D10682
+       vadduwm 16,16,30
+       .long   0x13CE7E82
+       vadduwm 16,16,30
+       vadduwm 16,16,9
+       vadduwm 0,0,15
+       vsel    29,7,6,5
+       vadduwm 7,7,28
+       vadduwm 0,0,29
+       .long   0x13C5FE82
+       vadduwm 0,0,30
+       vxor    29,1,2
+       vsel    29,2,3,29
+       vadduwm 4,4,0
+       .long   0x13C18682
+       vadduwm 30,30,29
+       vadduwm 0,0,30
+       lvx     28,10,7
+       .long   0x13D20682
+       vadduwm 17,17,30
+       .long   0x13CF7E82
+       vadduwm 17,17,30
+       vadduwm 17,17,10
+       vadduwm 7,7,16
+       vsel    29,6,5,4
+       vadduwm 6,6,28
+       vadduwm 7,7,29
+       .long   0x13C4FE82
+       vadduwm 7,7,30
+       vxor    29,0,1
+       vsel    29,1,2,29
+       vadduwm 3,3,7
+       .long   0x13C08682
+       vadduwm 30,30,29
+       vadduwm 7,7,30
+       lvx     28,26,7
+       .long   0x13D30682
+       vadduwm 18,18,30
+       .long   0x13D07E82
+       vadduwm 18,18,30
+       vadduwm 18,18,11
+       vadduwm 6,6,17
+       vsel    29,5,4,3
+       vadduwm 5,5,28
+       vadduwm 6,6,29
+       .long   0x13C3FE82
+       vadduwm 6,6,30
+       vxor    29,7,0
+       vsel    29,0,1,29
+       vadduwm 2,2,6
+       .long   0x13C78682
+       vadduwm 30,30,29
+       vadduwm 6,6,30
+       lvx     28,27,7
+       .long   0x13D80682
+       vadduwm 19,19,30
+       .long   0x13D17E82
+       vadduwm 19,19,30
+       vadduwm 19,19,12
+       vadduwm 5,5,18
+       vsel    29,4,3,2
+       vadduwm 4,4,28
+       vadduwm 5,5,29
+       .long   0x13C2FE82
+       vadduwm 5,5,30
+       vxor    29,6,7
+       vsel    29,7,0,29
+       vadduwm 1,1,5
+       .long   0x13C68682
+       vadduwm 30,30,29
+       vadduwm 5,5,30
+       lvx     28,28,7
+       .long   0x13D90682
+       vadduwm 24,24,30
+       .long   0x13D27E82
+       vadduwm 24,24,30
+       vadduwm 24,24,13
+       vadduwm 4,4,19
+       vsel    29,3,2,1
+       vadduwm 3,3,28
+       vadduwm 4,4,29
+       .long   0x13C1FE82
+       vadduwm 4,4,30
+       vxor    29,5,6
+       vsel    29,6,7,29
+       vadduwm 0,0,4
+       .long   0x13C58682
+       vadduwm 30,30,29
+       vadduwm 4,4,30
+       lvx     28,29,7
+       .long   0x13DA0682
+       vadduwm 25,25,30
+       .long   0x13D37E82
+       vadduwm 25,25,30
+       vadduwm 25,25,14
+       vadduwm 3,3,24
+       vsel    29,2,1,0
+       vadduwm 2,2,28
+       vadduwm 3,3,29
+       .long   0x13C0FE82
+       vadduwm 3,3,30
+       vxor    29,4,5
+       vsel    29,5,6,29
+       vadduwm 7,7,3
+       .long   0x13C48682
+       vadduwm 30,30,29
+       vadduwm 3,3,30
+       lvx     28,30,7
+       .long   0x13DB0682
+       vadduwm 26,26,30
+       .long   0x13D87E82
+       vadduwm 26,26,30
+       vadduwm 26,26,15
+       vadduwm 2,2,25
+       vsel    29,1,0,7
+       vadduwm 1,1,28
+       vadduwm 2,2,29
+       .long   0x13C7FE82
+       vadduwm 2,2,30
+       vxor    29,3,4
+       vsel    29,4,5,29
+       vadduwm 6,6,2
+       .long   0x13C38682
+       vadduwm 30,30,29
+       vadduwm 2,2,30
+       lvx     28,31,7
+       addi    7,7,0x80
+       .long   0x13C80682
+       vadduwm 27,27,30
+       .long   0x13D97E82
+       vadduwm 27,27,30
+       vadduwm 27,27,16
+       vadduwm 1,1,26
+       vsel    29,0,7,6
+       vadduwm 0,0,28
+       vadduwm 1,1,29
+       .long   0x13C6FE82
+       vadduwm 1,1,30
+       vxor    29,2,3
+       vsel    29,3,4,29
+       vadduwm 5,5,1
+       .long   0x13C28682
+       vadduwm 30,30,29
+       vadduwm 1,1,30
+       lvx     28,0,7
+       .long   0x13C90682
+       vadduwm 8,8,30
+       .long   0x13DA7E82
+       vadduwm 8,8,30
+       vadduwm 8,8,17
+       vadduwm 0,0,27
+       vsel    29,7,6,5
+       vadduwm 7,7,28
+       vadduwm 0,0,29
+       .long   0x13C5FE82
+       vadduwm 0,0,30
+       vxor    29,1,2
+       vsel    29,2,3,29
+       vadduwm 4,4,0
+       .long   0x13C18682
+       vadduwm 30,30,29
+       vadduwm 0,0,30
+       lvx     28,10,7
+       bdnz    .L16_xx
+
+       lvx     10,0,11
+       subic.  5,5,1
+       lvx     11,10,11
+       vadduwm 0,0,10
+       lvx     12,26,11
+       vadduwm 1,1,11
+       lvx     13,27,11
+       vadduwm 2,2,12
+       lvx     14,28,11
+       vadduwm 3,3,13
+       lvx     15,29,11
+       vadduwm 4,4,14
+       lvx     16,30,11
+       vadduwm 5,5,15
+       lvx     17,31,11
+       vadduwm 6,6,16
+       vadduwm 7,7,17
+       bne     .Loop
+       lvx     8,26,7
+       vperm   0,0,1,28
+       lvx     9,27,7
+       vperm   4,4,5,28
+       vperm   0,0,2,8
+       vperm   4,4,6,8
+       vperm   0,0,3,9
+       vperm   4,4,7,9
+       .long   0x7C001F19
+       .long   0x7C8A1F19
+       addi    11,1,207
+       mtlr    8
+       mtspr   256,12
+       lvx     24,0,11
+       lvx     25,10,11
+       lvx     26,26,11
+       lvx     27,27,11
+       lvx     28,28,11
+       lvx     29,29,11
+       lvx     30,30,11
+       lvx     31,31,11
+       ld      26,336(1)
+       ld      27,344(1)
+       ld      28,352(1)
+       ld      29,360(1)
+       ld      30,368(1)
+       ld      31,376(1)
+       addi    1,1,384
+       blr     
+.long  0
+.byte  0,12,4,1,0x80,6,3,0
+.long  0
+.size  .zfs_sha256_power8,.-.zfs_sha256_power8
+.size  zfs_sha256_power8,.-.zfs_sha256_power8
+.align 6
+.LPICmeup:
+       mflr    0
+       bcl     20,31,$+4
+       mflr    6
+       addi    6,6,56
+       mtlr    0
+       blr     
+.long  0
+.byte  0,12,0x14,0,0,0,0,0
+.space 28
+.long  0x428a2f98,0x428a2f98,0x428a2f98,0x428a2f98
+.long  0x71374491,0x71374491,0x71374491,0x71374491
+.long  0xb5c0fbcf,0xb5c0fbcf,0xb5c0fbcf,0xb5c0fbcf
+.long  0xe9b5dba5,0xe9b5dba5,0xe9b5dba5,0xe9b5dba5
+.long  0x3956c25b,0x3956c25b,0x3956c25b,0x3956c25b
+.long  0x59f111f1,0x59f111f1,0x59f111f1,0x59f111f1
+.long  0x923f82a4,0x923f82a4,0x923f82a4,0x923f82a4
+.long  0xab1c5ed5,0xab1c5ed5,0xab1c5ed5,0xab1c5ed5
+.long  0xd807aa98,0xd807aa98,0xd807aa98,0xd807aa98
+.long  0x12835b01,0x12835b01,0x12835b01,0x12835b01
+.long  0x243185be,0x243185be,0x243185be,0x243185be
+.long  0x550c7dc3,0x550c7dc3,0x550c7dc3,0x550c7dc3
+.long  0x72be5d74,0x72be5d74,0x72be5d74,0x72be5d74
+.long  0x80deb1fe,0x80deb1fe,0x80deb1fe,0x80deb1fe
+.long  0x9bdc06a7,0x9bdc06a7,0x9bdc06a7,0x9bdc06a7
+.long  0xc19bf174,0xc19bf174,0xc19bf174,0xc19bf174
+.long  0xe49b69c1,0xe49b69c1,0xe49b69c1,0xe49b69c1
+.long  0xefbe4786,0xefbe4786,0xefbe4786,0xefbe4786
+.long  0x0fc19dc6,0x0fc19dc6,0x0fc19dc6,0x0fc19dc6
+.long  0x240ca1cc,0x240ca1cc,0x240ca1cc,0x240ca1cc
+.long  0x2de92c6f,0x2de92c6f,0x2de92c6f,0x2de92c6f
+.long  0x4a7484aa,0x4a7484aa,0x4a7484aa,0x4a7484aa
+.long  0x5cb0a9dc,0x5cb0a9dc,0x5cb0a9dc,0x5cb0a9dc
+.long  0x76f988da,0x76f988da,0x76f988da,0x76f988da
+.long  0x983e5152,0x983e5152,0x983e5152,0x983e5152
+.long  0xa831c66d,0xa831c66d,0xa831c66d,0xa831c66d
+.long  0xb00327c8,0xb00327c8,0xb00327c8,0xb00327c8
+.long  0xbf597fc7,0xbf597fc7,0xbf597fc7,0xbf597fc7
+.long  0xc6e00bf3,0xc6e00bf3,0xc6e00bf3,0xc6e00bf3
+.long  0xd5a79147,0xd5a79147,0xd5a79147,0xd5a79147
+.long  0x06ca6351,0x06ca6351,0x06ca6351,0x06ca6351
+.long  0x14292967,0x14292967,0x14292967,0x14292967
+.long  0x27b70a85,0x27b70a85,0x27b70a85,0x27b70a85
+.long  0x2e1b2138,0x2e1b2138,0x2e1b2138,0x2e1b2138
+.long  0x4d2c6dfc,0x4d2c6dfc,0x4d2c6dfc,0x4d2c6dfc
+.long  0x53380d13,0x53380d13,0x53380d13,0x53380d13
+.long  0x650a7354,0x650a7354,0x650a7354,0x650a7354
+.long  0x766a0abb,0x766a0abb,0x766a0abb,0x766a0abb
+.long  0x81c2c92e,0x81c2c92e,0x81c2c92e,0x81c2c92e
+.long  0x92722c85,0x92722c85,0x92722c85,0x92722c85
+.long  0xa2bfe8a1,0xa2bfe8a1,0xa2bfe8a1,0xa2bfe8a1
+.long  0xa81a664b,0xa81a664b,0xa81a664b,0xa81a664b
+.long  0xc24b8b70,0xc24b8b70,0xc24b8b70,0xc24b8b70
+.long  0xc76c51a3,0xc76c51a3,0xc76c51a3,0xc76c51a3
+.long  0xd192e819,0xd192e819,0xd192e819,0xd192e819
+.long  0xd6990624,0xd6990624,0xd6990624,0xd6990624
+.long  0xf40e3585,0xf40e3585,0xf40e3585,0xf40e3585
+.long  0x106aa070,0x106aa070,0x106aa070,0x106aa070
+.long  0x19a4c116,0x19a4c116,0x19a4c116,0x19a4c116
+.long  0x1e376c08,0x1e376c08,0x1e376c08,0x1e376c08
+.long  0x2748774c,0x2748774c,0x2748774c,0x2748774c
+.long  0x34b0bcb5,0x34b0bcb5,0x34b0bcb5,0x34b0bcb5
+.long  0x391c0cb3,0x391c0cb3,0x391c0cb3,0x391c0cb3
+.long  0x4ed8aa4a,0x4ed8aa4a,0x4ed8aa4a,0x4ed8aa4a
+.long  0x5b9cca4f,0x5b9cca4f,0x5b9cca4f,0x5b9cca4f
+.long  0x682e6ff3,0x682e6ff3,0x682e6ff3,0x682e6ff3
+.long  0x748f82ee,0x748f82ee,0x748f82ee,0x748f82ee
+.long  0x78a5636f,0x78a5636f,0x78a5636f,0x78a5636f
+.long  0x84c87814,0x84c87814,0x84c87814,0x84c87814
+.long  0x8cc70208,0x8cc70208,0x8cc70208,0x8cc70208
+.long  0x90befffa,0x90befffa,0x90befffa,0x90befffa
+.long  0xa4506ceb,0xa4506ceb,0xa4506ceb,0xa4506ceb
+.long  0xbef9a3f7,0xbef9a3f7,0xbef9a3f7,0xbef9a3f7
+.long  0xc67178f2,0xc67178f2,0xc67178f2,0xc67178f2
+.long  0,0,0,0
+.long  0x00010203,0x10111213,0x10111213,0x10111213
+.long  0x00010203,0x04050607,0x10111213,0x10111213
+.long  0x00010203,0x04050607,0x08090a0b,0x10111213
+
+#elif (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+
+.abiversion    2
+.text
+
+.globl zfs_sha256_power8
+.type  zfs_sha256_power8,@function
+.align 6
+zfs_sha256_power8:
+.localentry    zfs_sha256_power8,0
+
+       stdu    1,-384(1)
+       mflr    8
+       li      10,207
+       li      11,223
+       stvx    24,10,1
+       addi    10,10,32
+       li      12,-1
+       stvx    25,11,1
+       addi    11,11,32
+       stvx    26,10,1
+       addi    10,10,32
+       stvx    27,11,1
+       addi    11,11,32
+       stvx    28,10,1
+       addi    10,10,32
+       stvx    29,11,1
+       addi    11,11,32
+       stvx    30,10,1
+       stvx    31,11,1
+       li      11,-4096+255
+       stw     12,332(1)
+       li      10,0x10
+       std     26,336(1)
+       li      26,0x20
+       std     27,344(1)
+       li      27,0x30
+       std     28,352(1)
+       li      28,0x40
+       std     29,360(1)
+       li      29,0x50
+       std     30,368(1)
+       li      30,0x60
+       std     31,376(1)
+       li      31,0x70
+       std     8,400(1)
+       or      11,11,11
+
+       bl      .LPICmeup
+       addi    11,1,79
+       li      7,8
+       lvsl    31,0,7
+       vspltisb        28,0x0f
+       vxor    31,31,28
+       .long   0x7C001E19
+       .long   0x7C8A1E19
+       vsldoi  1,0,0,4
+       vsldoi  2,0,0,8
+       vsldoi  3,0,0,12
+       vsldoi  5,4,4,4
+       vsldoi  6,4,4,8
+       vsldoi  7,4,4,12
+       li      0,3
+       b       .Loop
+.align 5
+.Loop:
+       lvx     28,0,6
+       .long   0x7D002699
+       addi    4,4,16
+       mr      7,6
+       stvx    0,0,11
+       stvx    1,10,11
+       stvx    2,26,11
+       stvx    3,27,11
+       stvx    4,28,11
+       stvx    5,29,11
+       stvx    6,30,11
+       stvx    7,31,11
+       vadduwm 7,7,28
+       lvx     28,10,6
+       vperm   8,8,8,31
+       vadduwm 7,7,8
+       vsel    29,6,5,4
+       vadduwm 6,6,28
+       vadduwm 7,7,29
+       .long   0x13C4FE82
+       vadduwm 7,7,30
+       vxor    29,0,1
+       vsel    29,1,2,29
+       vadduwm 3,3,7
+       .long   0x13C08682
+       vadduwm 30,30,29
+       vadduwm 7,7,30
+       lvx     28,26,7
+       vsldoi  9,8,8,4
+       vadduwm 6,6,9
+       vsel    29,5,4,3
+       vadduwm 5,5,28
+       vadduwm 6,6,29
+       .long   0x13C3FE82
+       vadduwm 6,6,30
+       vxor    29,7,0
+       vsel    29,0,1,29
+       vadduwm 2,2,6
+       .long   0x13C78682
+       vadduwm 30,30,29
+       vadduwm 6,6,30
+       lvx     28,27,7
+       vsldoi  10,9,9,4
+       vadduwm 5,5,10
+       vsel    29,4,3,2
+       vadduwm 4,4,28
+       vadduwm 5,5,29
+       .long   0x13C2FE82
+       vadduwm 5,5,30
+       vxor    29,6,7
+       vsel    29,7,0,29
+       vadduwm 1,1,5
+       .long   0x13C68682
+       vadduwm 30,30,29
+       vadduwm 5,5,30
+       lvx     28,28,7
+       .long   0x7D802699
+       addi    4,4,16
+       vsldoi  11,10,10,4
+       vadduwm 4,4,11
+       vsel    29,3,2,1
+       vadduwm 3,3,28
+       vadduwm 4,4,29
+       .long   0x13C1FE82
+       vadduwm 4,4,30
+       vxor    29,5,6
+       vsel    29,6,7,29
+       vadduwm 0,0,4
+       .long   0x13C58682
+       vadduwm 30,30,29
+       vadduwm 4,4,30
+       lvx     28,29,7
+       vperm   12,12,12,31
+       vadduwm 3,3,12
+       vsel    29,2,1,0
+       vadduwm 2,2,28
+       vadduwm 3,3,29
+       .long   0x13C0FE82
+       vadduwm 3,3,30
+       vxor    29,4,5
+       vsel    29,5,6,29
+       vadduwm 7,7,3
+       .long   0x13C48682
+       vadduwm 30,30,29
+       vadduwm 3,3,30
+       lvx     28,30,7
+       vsldoi  13,12,12,4
+       vadduwm 2,2,13
+       vsel    29,1,0,7
+       vadduwm 1,1,28
+       vadduwm 2,2,29
+       .long   0x13C7FE82
+       vadduwm 2,2,30
+       vxor    29,3,4
+       vsel    29,4,5,29
+       vadduwm 6,6,2
+       .long   0x13C38682
+       vadduwm 30,30,29
+       vadduwm 2,2,30
+       lvx     28,31,7
+       addi    7,7,0x80
+       vsldoi  14,13,13,4
+       vadduwm 1,1,14
+       vsel    29,0,7,6
+       vadduwm 0,0,28
+       vadduwm 1,1,29
+       .long   0x13C6FE82
+       vadduwm 1,1,30
+       vxor    29,2,3
+       vsel    29,3,4,29
+       vadduwm 5,5,1
+       .long   0x13C28682
+       vadduwm 30,30,29
+       vadduwm 1,1,30
+       lvx     28,0,7
+       .long   0x7E002699
+       addi    4,4,16
+       vsldoi  15,14,14,4
+       vadduwm 0,0,15
+       vsel    29,7,6,5
+       vadduwm 7,7,28
+       vadduwm 0,0,29
+       .long   0x13C5FE82
+       vadduwm 0,0,30
+       vxor    29,1,2
+       vsel    29,2,3,29
+       vadduwm 4,4,0
+       .long   0x13C18682
+       vadduwm 30,30,29
+       vadduwm 0,0,30
+       lvx     28,10,7
+       vperm   16,16,16,31
+       vadduwm 7,7,16
+       vsel    29,6,5,4
+       vadduwm 6,6,28
+       vadduwm 7,7,29
+       .long   0x13C4FE82
+       vadduwm 7,7,30
+       vxor    29,0,1
+       vsel    29,1,2,29
+       vadduwm 3,3,7
+       .long   0x13C08682
+       vadduwm 30,30,29
+       vadduwm 7,7,30
+       lvx     28,26,7
+       vsldoi  17,16,16,4
+       vadduwm 6,6,17
+       vsel    29,5,4,3
+       vadduwm 5,5,28
+       vadduwm 6,6,29
+       .long   0x13C3FE82
+       vadduwm 6,6,30
+       vxor    29,7,0
+       vsel    29,0,1,29
+       vadduwm 2,2,6
+       .long   0x13C78682
+       vadduwm 30,30,29
+       vadduwm 6,6,30
+       lvx     28,27,7
+       vsldoi  18,17,17,4
+       vadduwm 5,5,18
+       vsel    29,4,3,2
+       vadduwm 4,4,28
+       vadduwm 5,5,29
+       .long   0x13C2FE82
+       vadduwm 5,5,30
+       vxor    29,6,7
+       vsel    29,7,0,29
+       vadduwm 1,1,5
+       .long   0x13C68682
+       vadduwm 30,30,29
+       vadduwm 5,5,30
+       lvx     28,28,7
+       .long   0x7F002699
+       addi    4,4,16
+       vsldoi  19,18,18,4
+       vadduwm 4,4,19
+       vsel    29,3,2,1
+       vadduwm 3,3,28
+       vadduwm 4,4,29
+       .long   0x13C1FE82
+       vadduwm 4,4,30
+       vxor    29,5,6
+       vsel    29,6,7,29
+       vadduwm 0,0,4
+       .long   0x13C58682
+       vadduwm 30,30,29
+       vadduwm 4,4,30
+       lvx     28,29,7
+       vperm   24,24,24,31
+       vadduwm 3,3,24
+       vsel    29,2,1,0
+       vadduwm 2,2,28
+       vadduwm 3,3,29
+       .long   0x13C0FE82
+       vadduwm 3,3,30
+       vxor    29,4,5
+       vsel    29,5,6,29
+       vadduwm 7,7,3
+       .long   0x13C48682
+       vadduwm 30,30,29
+       vadduwm 3,3,30
+       lvx     28,30,7
+       vsldoi  25,24,24,4
+       vadduwm 2,2,25
+       vsel    29,1,0,7
+       vadduwm 1,1,28
+       vadduwm 2,2,29
+       .long   0x13C7FE82
+       vadduwm 2,2,30
+       vxor    29,3,4
+       vsel    29,4,5,29
+       vadduwm 6,6,2
+       .long   0x13C38682
+       vadduwm 30,30,29
+       vadduwm 2,2,30
+       lvx     28,31,7
+       addi    7,7,0x80
+       vsldoi  26,25,25,4
+       vadduwm 1,1,26
+       vsel    29,0,7,6
+       vadduwm 0,0,28
+       vadduwm 1,1,29
+       .long   0x13C6FE82
+       vadduwm 1,1,30
+       vxor    29,2,3
+       vsel    29,3,4,29
+       vadduwm 5,5,1
+       .long   0x13C28682
+       vadduwm 30,30,29
+       vadduwm 1,1,30
+       lvx     28,0,7
+       vsldoi  27,26,26,4
+       .long   0x13C90682
+       vadduwm 8,8,30
+       .long   0x13DA7E82
+       vadduwm 8,8,30
+       vadduwm 8,8,17
+       vadduwm 0,0,27
+       vsel    29,7,6,5
+       vadduwm 7,7,28
+       vadduwm 0,0,29
+       .long   0x13C5FE82
+       vadduwm 0,0,30
+       vxor    29,1,2
+       vsel    29,2,3,29
+       vadduwm 4,4,0
+       .long   0x13C18682
+       vadduwm 30,30,29
+       vadduwm 0,0,30
+       lvx     28,10,7
+       mtctr   0
+       b       .L16_xx
+.align 5
+.L16_xx:
+       .long   0x13CA0682
+       vadduwm 9,9,30
+       .long   0x13DB7E82
+       vadduwm 9,9,30
+       vadduwm 9,9,18
+       vadduwm 7,7,8
+       vsel    29,6,5,4
+       vadduwm 6,6,28
+       vadduwm 7,7,29
+       .long   0x13C4FE82
+       vadduwm 7,7,30
+       vxor    29,0,1
+       vsel    29,1,2,29
+       vadduwm 3,3,7
+       .long   0x13C08682
+       vadduwm 30,30,29
+       vadduwm 7,7,30
+       lvx     28,26,7
+       .long   0x13CB0682
+       vadduwm 10,10,30
+       .long   0x13C87E82
+       vadduwm 10,10,30
+       vadduwm 10,10,19
+       vadduwm 6,6,9
+       vsel    29,5,4,3
+       vadduwm 5,5,28
+       vadduwm 6,6,29
+       .long   0x13C3FE82
+       vadduwm 6,6,30
+       vxor    29,7,0
+       vsel    29,0,1,29
+       vadduwm 2,2,6
+       .long   0x13C78682
+       vadduwm 30,30,29
+       vadduwm 6,6,30
+       lvx     28,27,7
+       .long   0x13CC0682
+       vadduwm 11,11,30
+       .long   0x13C97E82
+       vadduwm 11,11,30
+       vadduwm 11,11,24
+       vadduwm 5,5,10
+       vsel    29,4,3,2
+       vadduwm 4,4,28
+       vadduwm 5,5,29
+       .long   0x13C2FE82
+       vadduwm 5,5,30
+       vxor    29,6,7
+       vsel    29,7,0,29
+       vadduwm 1,1,5
+       .long   0x13C68682
+       vadduwm 30,30,29
+       vadduwm 5,5,30
+       lvx     28,28,7
+       .long   0x13CD0682
+       vadduwm 12,12,30
+       .long   0x13CA7E82
+       vadduwm 12,12,30
+       vadduwm 12,12,25
+       vadduwm 4,4,11
+       vsel    29,3,2,1
+       vadduwm 3,3,28
+       vadduwm 4,4,29
+       .long   0x13C1FE82
+       vadduwm 4,4,30
+       vxor    29,5,6
+       vsel    29,6,7,29
+       vadduwm 0,0,4
+       .long   0x13C58682
+       vadduwm 30,30,29
+       vadduwm 4,4,30
+       lvx     28,29,7
+       .long   0x13CE0682
+       vadduwm 13,13,30
+       .long   0x13CB7E82
+       vadduwm 13,13,30
+       vadduwm 13,13,26
+       vadduwm 3,3,12
+       vsel    29,2,1,0
+       vadduwm 2,2,28
+       vadduwm 3,3,29
+       .long   0x13C0FE82
+       vadduwm 3,3,30
+       vxor    29,4,5
+       vsel    29,5,6,29
+       vadduwm 7,7,3
+       .long   0x13C48682
+       vadduwm 30,30,29
+       vadduwm 3,3,30
+       lvx     28,30,7
+       .long   0x13CF0682
+       vadduwm 14,14,30
+       .long   0x13CC7E82
+       vadduwm 14,14,30
+       vadduwm 14,14,27
+       vadduwm 2,2,13
+       vsel    29,1,0,7
+       vadduwm 1,1,28
+       vadduwm 2,2,29
+       .long   0x13C7FE82
+       vadduwm 2,2,30
+       vxor    29,3,4
+       vsel    29,4,5,29
+       vadduwm 6,6,2
+       .long   0x13C38682
+       vadduwm 30,30,29
+       vadduwm 2,2,30
+       lvx     28,31,7
+       addi    7,7,0x80
+       .long   0x13D00682
+       vadduwm 15,15,30
+       .long   0x13CD7E82
+       vadduwm 15,15,30
+       vadduwm 15,15,8
+       vadduwm 1,1,14
+       vsel    29,0,7,6
+       vadduwm 0,0,28
+       vadduwm 1,1,29
+       .long   0x13C6FE82
+       vadduwm 1,1,30
+       vxor    29,2,3
+       vsel    29,3,4,29
+       vadduwm 5,5,1
+       .long   0x13C28682
+       vadduwm 30,30,29
+       vadduwm 1,1,30
+       lvx     28,0,7
+       .long   0x13D10682
+       vadduwm 16,16,30
+       .long   0x13CE7E82
+       vadduwm 16,16,30
+       vadduwm 16,16,9
+       vadduwm 0,0,15
+       vsel    29,7,6,5
+       vadduwm 7,7,28
+       vadduwm 0,0,29
+       .long   0x13C5FE82
+       vadduwm 0,0,30
+       vxor    29,1,2
+       vsel    29,2,3,29
+       vadduwm 4,4,0
+       .long   0x13C18682
+       vadduwm 30,30,29
+       vadduwm 0,0,30
+       lvx     28,10,7
+       .long   0x13D20682
+       vadduwm 17,17,30
+       .long   0x13CF7E82
+       vadduwm 17,17,30
+       vadduwm 17,17,10
+       vadduwm 7,7,16
+       vsel    29,6,5,4
+       vadduwm 6,6,28
+       vadduwm 7,7,29
+       .long   0x13C4FE82
+       vadduwm 7,7,30
+       vxor    29,0,1
+       vsel    29,1,2,29
+       vadduwm 3,3,7
+       .long   0x13C08682
+       vadduwm 30,30,29
+       vadduwm 7,7,30
+       lvx     28,26,7
+       .long   0x13D30682
+       vadduwm 18,18,30
+       .long   0x13D07E82
+       vadduwm 18,18,30
+       vadduwm 18,18,11
+       vadduwm 6,6,17
+       vsel    29,5,4,3
+       vadduwm 5,5,28
+       vadduwm 6,6,29
+       .long   0x13C3FE82
+       vadduwm 6,6,30
+       vxor    29,7,0
+       vsel    29,0,1,29
+       vadduwm 2,2,6
+       .long   0x13C78682
+       vadduwm 30,30,29
+       vadduwm 6,6,30
+       lvx     28,27,7
+       .long   0x13D80682
+       vadduwm 19,19,30
+       .long   0x13D17E82
+       vadduwm 19,19,30
+       vadduwm 19,19,12
+       vadduwm 5,5,18
+       vsel    29,4,3,2
+       vadduwm 4,4,28
+       vadduwm 5,5,29
+       .long   0x13C2FE82
+       vadduwm 5,5,30
+       vxor    29,6,7
+       vsel    29,7,0,29
+       vadduwm 1,1,5
+       .long   0x13C68682
+       vadduwm 30,30,29
+       vadduwm 5,5,30
+       lvx     28,28,7
+       .long   0x13D90682
+       vadduwm 24,24,30
+       .long   0x13D27E82
+       vadduwm 24,24,30
+       vadduwm 24,24,13
+       vadduwm 4,4,19
+       vsel    29,3,2,1
+       vadduwm 3,3,28
+       vadduwm 4,4,29
+       .long   0x13C1FE82
+       vadduwm 4,4,30
+       vxor    29,5,6
+       vsel    29,6,7,29
+       vadduwm 0,0,4
+       .long   0x13C58682
+       vadduwm 30,30,29
+       vadduwm 4,4,30
+       lvx     28,29,7
+       .long   0x13DA0682
+       vadduwm 25,25,30
+       .long   0x13D37E82
+       vadduwm 25,25,30
+       vadduwm 25,25,14
+       vadduwm 3,3,24
+       vsel    29,2,1,0
+       vadduwm 2,2,28
+       vadduwm 3,3,29
+       .long   0x13C0FE82
+       vadduwm 3,3,30
+       vxor    29,4,5
+       vsel    29,5,6,29
+       vadduwm 7,7,3
+       .long   0x13C48682
+       vadduwm 30,30,29
+       vadduwm 3,3,30
+       lvx     28,30,7
+       .long   0x13DB0682
+       vadduwm 26,26,30
+       .long   0x13D87E82
+       vadduwm 26,26,30
+       vadduwm 26,26,15
+       vadduwm 2,2,25
+       vsel    29,1,0,7
+       vadduwm 1,1,28
+       vadduwm 2,2,29
+       .long   0x13C7FE82
+       vadduwm 2,2,30
+       vxor    29,3,4
+       vsel    29,4,5,29
+       vadduwm 6,6,2
+       .long   0x13C38682
+       vadduwm 30,30,29
+       vadduwm 2,2,30
+       lvx     28,31,7
+       addi    7,7,0x80
+       .long   0x13C80682
+       vadduwm 27,27,30
+       .long   0x13D97E82
+       vadduwm 27,27,30
+       vadduwm 27,27,16
+       vadduwm 1,1,26
+       vsel    29,0,7,6
+       vadduwm 0,0,28
+       vadduwm 1,1,29
+       .long   0x13C6FE82
+       vadduwm 1,1,30
+       vxor    29,2,3
+       vsel    29,3,4,29
+       vadduwm 5,5,1
+       .long   0x13C28682
+       vadduwm 30,30,29
+       vadduwm 1,1,30
+       lvx     28,0,7
+       .long   0x13C90682
+       vadduwm 8,8,30
+       .long   0x13DA7E82
+       vadduwm 8,8,30
+       vadduwm 8,8,17
+       vadduwm 0,0,27
+       vsel    29,7,6,5
+       vadduwm 7,7,28
+       vadduwm 0,0,29
+       .long   0x13C5FE82
+       vadduwm 0,0,30
+       vxor    29,1,2
+       vsel    29,2,3,29
+       vadduwm 4,4,0
+       .long   0x13C18682
+       vadduwm 30,30,29
+       vadduwm 0,0,30
+       lvx     28,10,7
+       bdnz    .L16_xx
+
+       lvx     10,0,11
+       subic.  5,5,1
+       lvx     11,10,11
+       vadduwm 0,0,10
+       lvx     12,26,11
+       vadduwm 1,1,11
+       lvx     13,27,11
+       vadduwm 2,2,12
+       lvx     14,28,11
+       vadduwm 3,3,13
+       lvx     15,29,11
+       vadduwm 4,4,14
+       lvx     16,30,11
+       vadduwm 5,5,15
+       lvx     17,31,11
+       vadduwm 6,6,16
+       vadduwm 7,7,17
+       bne     .Loop
+       lvx     8,26,7
+       vperm   0,0,1,28
+       lvx     9,27,7
+       vperm   4,4,5,28
+       vperm   0,0,2,8
+       vperm   4,4,6,8
+       vperm   0,0,3,9
+       vperm   4,4,7,9
+       .long   0x7C001F19
+       .long   0x7C8A1F19
+       addi    11,1,207
+       mtlr    8
+       or      12,12,12
+       lvx     24,0,11
+       lvx     25,10,11
+       lvx     26,26,11
+       lvx     27,27,11
+       lvx     28,28,11
+       lvx     29,29,11
+       lvx     30,30,11
+       lvx     31,31,11
+       ld      26,336(1)
+       ld      27,344(1)
+       ld      28,352(1)
+       ld      29,360(1)
+       ld      30,368(1)
+       ld      31,376(1)
+       addi    1,1,384
+       blr     
+.long  0
+.byte  0,12,4,1,0x80,6,3,0
+.long  0
+.size  zfs_sha256_power8,.-zfs_sha256_power8
+.align 6
+.LPICmeup:
+       mflr    0
+       bcl     20,31,$+4
+       mflr    6
+       addi    6,6,56
+       mtlr    0
+       blr     
+.long  0
+.byte  0,12,0x14,0,0,0,0,0
+.space 28
+.long  0x428a2f98,0x428a2f98,0x428a2f98,0x428a2f98
+.long  0x71374491,0x71374491,0x71374491,0x71374491
+.long  0xb5c0fbcf,0xb5c0fbcf,0xb5c0fbcf,0xb5c0fbcf
+.long  0xe9b5dba5,0xe9b5dba5,0xe9b5dba5,0xe9b5dba5
+.long  0x3956c25b,0x3956c25b,0x3956c25b,0x3956c25b
+.long  0x59f111f1,0x59f111f1,0x59f111f1,0x59f111f1
+.long  0x923f82a4,0x923f82a4,0x923f82a4,0x923f82a4
+.long  0xab1c5ed5,0xab1c5ed5,0xab1c5ed5,0xab1c5ed5
+.long  0xd807aa98,0xd807aa98,0xd807aa98,0xd807aa98
+.long  0x12835b01,0x12835b01,0x12835b01,0x12835b01
+.long  0x243185be,0x243185be,0x243185be,0x243185be
+.long  0x550c7dc3,0x550c7dc3,0x550c7dc3,0x550c7dc3
+.long  0x72be5d74,0x72be5d74,0x72be5d74,0x72be5d74
+.long  0x80deb1fe,0x80deb1fe,0x80deb1fe,0x80deb1fe
+.long  0x9bdc06a7,0x9bdc06a7,0x9bdc06a7,0x9bdc06a7
+.long  0xc19bf174,0xc19bf174,0xc19bf174,0xc19bf174
+.long  0xe49b69c1,0xe49b69c1,0xe49b69c1,0xe49b69c1
+.long  0xefbe4786,0xefbe4786,0xefbe4786,0xefbe4786
+.long  0x0fc19dc6,0x0fc19dc6,0x0fc19dc6,0x0fc19dc6
+.long  0x240ca1cc,0x240ca1cc,0x240ca1cc,0x240ca1cc
+.long  0x2de92c6f,0x2de92c6f,0x2de92c6f,0x2de92c6f
+.long  0x4a7484aa,0x4a7484aa,0x4a7484aa,0x4a7484aa
+.long  0x5cb0a9dc,0x5cb0a9dc,0x5cb0a9dc,0x5cb0a9dc
+.long  0x76f988da,0x76f988da,0x76f988da,0x76f988da
+.long  0x983e5152,0x983e5152,0x983e5152,0x983e5152
+.long  0xa831c66d,0xa831c66d,0xa831c66d,0xa831c66d
+.long  0xb00327c8,0xb00327c8,0xb00327c8,0xb00327c8
+.long  0xbf597fc7,0xbf597fc7,0xbf597fc7,0xbf597fc7
+.long  0xc6e00bf3,0xc6e00bf3,0xc6e00bf3,0xc6e00bf3
+.long  0xd5a79147,0xd5a79147,0xd5a79147,0xd5a79147
+.long  0x06ca6351,0x06ca6351,0x06ca6351,0x06ca6351
+.long  0x14292967,0x14292967,0x14292967,0x14292967
+.long  0x27b70a85,0x27b70a85,0x27b70a85,0x27b70a85
+.long  0x2e1b2138,0x2e1b2138,0x2e1b2138,0x2e1b2138
+.long  0x4d2c6dfc,0x4d2c6dfc,0x4d2c6dfc,0x4d2c6dfc
+.long  0x53380d13,0x53380d13,0x53380d13,0x53380d13
+.long  0x650a7354,0x650a7354,0x650a7354,0x650a7354
+.long  0x766a0abb,0x766a0abb,0x766a0abb,0x766a0abb
+.long  0x81c2c92e,0x81c2c92e,0x81c2c92e,0x81c2c92e
+.long  0x92722c85,0x92722c85,0x92722c85,0x92722c85
+.long  0xa2bfe8a1,0xa2bfe8a1,0xa2bfe8a1,0xa2bfe8a1
+.long  0xa81a664b,0xa81a664b,0xa81a664b,0xa81a664b
+.long  0xc24b8b70,0xc24b8b70,0xc24b8b70,0xc24b8b70
+.long  0xc76c51a3,0xc76c51a3,0xc76c51a3,0xc76c51a3
+.long  0xd192e819,0xd192e819,0xd192e819,0xd192e819
+.long  0xd6990624,0xd6990624,0xd6990624,0xd6990624
+.long  0xf40e3585,0xf40e3585,0xf40e3585,0xf40e3585
+.long  0x106aa070,0x106aa070,0x106aa070,0x106aa070
+.long  0x19a4c116,0x19a4c116,0x19a4c116,0x19a4c116
+.long  0x1e376c08,0x1e376c08,0x1e376c08,0x1e376c08
+.long  0x2748774c,0x2748774c,0x2748774c,0x2748774c
+.long  0x34b0bcb5,0x34b0bcb5,0x34b0bcb5,0x34b0bcb5
+.long  0x391c0cb3,0x391c0cb3,0x391c0cb3,0x391c0cb3
+.long  0x4ed8aa4a,0x4ed8aa4a,0x4ed8aa4a,0x4ed8aa4a
+.long  0x5b9cca4f,0x5b9cca4f,0x5b9cca4f,0x5b9cca4f
+.long  0x682e6ff3,0x682e6ff3,0x682e6ff3,0x682e6ff3
+.long  0x748f82ee,0x748f82ee,0x748f82ee,0x748f82ee
+.long  0x78a5636f,0x78a5636f,0x78a5636f,0x78a5636f
+.long  0x84c87814,0x84c87814,0x84c87814,0x84c87814
+.long  0x8cc70208,0x8cc70208,0x8cc70208,0x8cc70208
+.long  0x90befffa,0x90befffa,0x90befffa,0x90befffa
+.long  0xa4506ceb,0xa4506ceb,0xa4506ceb,0xa4506ceb
+.long  0xbef9a3f7,0xbef9a3f7,0xbef9a3f7,0xbef9a3f7
+.long  0xc67178f2,0xc67178f2,0xc67178f2,0xc67178f2
+.long  0,0,0,0
+.long  0x10111213,0x10111213,0x10111213,0x00010203
+.long  0x10111213,0x10111213,0x04050607,0x00010203
+.long  0x10111213,0x08090a0b,0x04050607,0x00010203
+#endif
diff --git a/module/icp/asm-ppc64/sha2/sha256-ppc.S b/module/icp/asm-ppc64/sha2/sha256-ppc.S
new file mode 100644
index 0000000..2219e31
--- /dev/null
@@ -0,0 +1,2712 @@
+/*
+ * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Portions Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ * - modified assembly to fit into OpenZFS
+ */
+
+#if (defined(__PPC64__) && defined(__BIG_ENDIAN__))
+
+.text
+
+.globl zfs_sha256_ppc
+.globl .zfs_sha256_ppc
+.type  zfs_sha256_ppc,@function
+.section       ".opd","aw"
+.align 3
+zfs_sha256_ppc:
+.quad  .zfs_sha256_ppc,.TOC.@tocbase,0
+.previous
+.align 6
+.zfs_sha256_ppc:
+       stdu    1,-320(1)
+       mflr    0
+       sldi    5,5,6
+
+       std     3,144(1)
+
+       std     14,176(1)
+       std     15,184(1)
+       std     16,192(1)
+       std     17,200(1)
+       std     18,208(1)
+       std     19,216(1)
+       std     20,224(1)
+       std     21,232(1)
+       std     22,240(1)
+       std     23,248(1)
+       std     24,256(1)
+       std     25,264(1)
+       std     26,272(1)
+       std     27,280(1)
+       std     28,288(1)
+       std     29,296(1)
+       std     30,304(1)
+       std     31,312(1)
+       std     0,336(1)
+       lwz     8,0(3)
+       mr      31,4
+       lwz     9,4(3)
+       lwz     10,8(3)
+       lwz     11,12(3)
+       lwz     12,16(3)
+       lwz     6,20(3)
+       lwz     14,24(3)
+       lwz     15,28(3)
+       bl      .LPICmeup
+.LPICedup:
+       andi.   0,31,3
+       bne     .Lunaligned
+.Laligned:
+       add     5,31,5
+       std     5,128(1)
+       std     31,136(1)
+       bl      .Lsha2_block_private
+       b       .Ldone
+
+.align 4
+.Lunaligned:
+       subfic  0,31,4096
+       andi.   0,0,4032
+       beq     .Lcross_page
+       cmpld   5,0
+       ble     .Laligned
+       subfc   5,0,5
+       add     0,31,0
+       std     5,120(1)
+       std     0,128(1)
+       std     31,136(1)
+       bl      .Lsha2_block_private
+
+       ld      5,120(1)
+.Lcross_page:
+       li      0,16
+       mtctr   0
+       addi    20,1,48
+.Lmemcpy:
+       lbz     16,0(31)
+       lbz     17,1(31)
+       lbz     18,2(31)
+       lbz     19,3(31)
+       addi    31,31,4
+       stb     16,0(20)
+       stb     17,1(20)
+       stb     18,2(20)
+       stb     19,3(20)
+       addi    20,20,4
+       bdnz    .Lmemcpy
+       std     31,112(1)
+       addi    0,1,112
+       addi    31,1,48
+       std     5,120(1)
+       std     0,128(1)
+       std     31,136(1)
+       bl      .Lsha2_block_private
+       ld      31,112(1)
+       ld      5,120(1)
+       addic.  5,5,-64
+       bne     .Lunaligned
+
+.Ldone:
+       ld      0,336(1)
+       ld      14,176(1)
+       ld      15,184(1)
+       ld      16,192(1)
+       ld      17,200(1)
+       ld      18,208(1)
+       ld      19,216(1)
+       ld      20,224(1)
+       ld      21,232(1)
+       ld      22,240(1)
+       ld      23,248(1)
+       ld      24,256(1)
+       ld      25,264(1)
+       ld      26,272(1)
+       ld      27,280(1)
+       ld      28,288(1)
+       ld      29,296(1)
+       ld      30,304(1)
+       ld      31,312(1)
+       mtlr    0
+       addi    1,1,320
+       blr     
+.long  0
+.byte  0,12,4,1,0x80,18,3,0
+.long  0
+.align 4
+.Lsha2_block_private:
+       lwz     0,0(7)
+       lwz     16,0(31)
+       rotrwi  3,12,6
+       rotrwi  4,12,11
+       and     5,6,12
+       xor     3,3,4
+       add     15,15,0
+       andc    0,14,12
+       rotrwi  4,4,14
+       or      5,5,0
+       add     15,15,16
+       xor     3,3,4
+       add     15,15,5
+       add     15,15,3
+
+       rotrwi  3,8,2
+       rotrwi  4,8,13
+       and     5,8,9
+       and     0,8,10
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,9,10
+       xor     3,3,4
+       add     11,11,15
+       xor     5,5,0
+       lwz     0,4(7)
+       add     15,15,3
+       add     15,15,5
+
+       lwz     17,4(31)
+       rotrwi  3,11,6
+       rotrwi  4,11,11
+       and     5,12,11
+       xor     3,3,4
+       add     14,14,0
+       andc    0,6,11
+       rotrwi  4,4,14
+       or      5,5,0
+       add     14,14,17
+       xor     3,3,4
+       add     14,14,5
+       add     14,14,3
+
+       rotrwi  3,15,2
+       rotrwi  4,15,13
+       and     5,15,8
+       and     0,15,9
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,8,9
+       xor     3,3,4
+       add     10,10,14
+       xor     5,5,0
+       lwz     0,8(7)
+       add     14,14,3
+       add     14,14,5
+
+       lwz     18,8(31)
+       rotrwi  3,10,6
+       rotrwi  4,10,11
+       and     5,11,10
+       xor     3,3,4
+       add     6,6,0
+       andc    0,12,10
+       rotrwi  4,4,14
+       or      5,5,0
+       add     6,6,18
+       xor     3,3,4
+       add     6,6,5
+       add     6,6,3
+
+       rotrwi  3,14,2
+       rotrwi  4,14,13
+       and     5,14,15
+       and     0,14,8
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,15,8
+       xor     3,3,4
+       add     9,9,6
+       xor     5,5,0
+       lwz     0,12(7)
+       add     6,6,3
+       add     6,6,5
+
+       lwz     19,12(31)
+       rotrwi  3,9,6
+       rotrwi  4,9,11
+       and     5,10,9
+       xor     3,3,4
+       add     12,12,0
+       andc    0,11,9
+       rotrwi  4,4,14
+       or      5,5,0
+       add     12,12,19
+       xor     3,3,4
+       add     12,12,5
+       add     12,12,3
+
+       rotrwi  3,6,2
+       rotrwi  4,6,13
+       and     5,6,14
+       and     0,6,15
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,14,15
+       xor     3,3,4
+       add     8,8,12
+       xor     5,5,0
+       lwz     0,16(7)
+       add     12,12,3
+       add     12,12,5
+
+       lwz     20,16(31)
+       rotrwi  3,8,6
+       rotrwi  4,8,11
+       and     5,9,8
+       xor     3,3,4
+       add     11,11,0
+       andc    0,10,8
+       rotrwi  4,4,14
+       or      5,5,0
+       add     11,11,20
+       xor     3,3,4
+       add     11,11,5
+       add     11,11,3
+
+       rotrwi  3,12,2
+       rotrwi  4,12,13
+       and     5,12,6
+       and     0,12,14
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,6,14
+       xor     3,3,4
+       add     15,15,11
+       xor     5,5,0
+       lwz     0,20(7)
+       add     11,11,3
+       add     11,11,5
+
+       lwz     21,20(31)
+       rotrwi  3,15,6
+       rotrwi  4,15,11
+       and     5,8,15
+       xor     3,3,4
+       add     10,10,0
+       andc    0,9,15
+       rotrwi  4,4,14
+       or      5,5,0
+       add     10,10,21
+       xor     3,3,4
+       add     10,10,5
+       add     10,10,3
+
+       rotrwi  3,11,2
+       rotrwi  4,11,13
+       and     5,11,12
+       and     0,11,6
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,12,6
+       xor     3,3,4
+       add     14,14,10
+       xor     5,5,0
+       lwz     0,24(7)
+       add     10,10,3
+       add     10,10,5
+
+       lwz     22,24(31)
+       rotrwi  3,14,6
+       rotrwi  4,14,11
+       and     5,15,14
+       xor     3,3,4
+       add     9,9,0
+       andc    0,8,14
+       rotrwi  4,4,14
+       or      5,5,0
+       add     9,9,22
+       xor     3,3,4
+       add     9,9,5
+       add     9,9,3
+
+       rotrwi  3,10,2
+       rotrwi  4,10,13
+       and     5,10,11
+       and     0,10,12
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,11,12
+       xor     3,3,4
+       add     6,6,9
+       xor     5,5,0
+       lwz     0,28(7)
+       add     9,9,3
+       add     9,9,5
+
+       lwz     23,28(31)
+       rotrwi  3,6,6
+       rotrwi  4,6,11
+       and     5,14,6
+       xor     3,3,4
+       add     8,8,0
+       andc    0,15,6
+       rotrwi  4,4,14
+       or      5,5,0
+       add     8,8,23
+       xor     3,3,4
+       add     8,8,5
+       add     8,8,3
+
+       rotrwi  3,9,2
+       rotrwi  4,9,13
+       and     5,9,10
+       and     0,9,11
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,10,11
+       xor     3,3,4
+       add     12,12,8
+       xor     5,5,0
+       lwz     0,32(7)
+       add     8,8,3
+       add     8,8,5
+
+       lwz     24,32(31)
+       rotrwi  3,12,6
+       rotrwi  4,12,11
+       and     5,6,12
+       xor     3,3,4
+       add     15,15,0
+       andc    0,14,12
+       rotrwi  4,4,14
+       or      5,5,0
+       add     15,15,24
+       xor     3,3,4
+       add     15,15,5
+       add     15,15,3
+
+       rotrwi  3,8,2
+       rotrwi  4,8,13
+       and     5,8,9
+       and     0,8,10
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,9,10
+       xor     3,3,4
+       add     11,11,15
+       xor     5,5,0
+       lwz     0,36(7)
+       add     15,15,3
+       add     15,15,5
+
+       lwz     25,36(31)
+       rotrwi  3,11,6
+       rotrwi  4,11,11
+       and     5,12,11
+       xor     3,3,4
+       add     14,14,0
+       andc    0,6,11
+       rotrwi  4,4,14
+       or      5,5,0
+       add     14,14,25
+       xor     3,3,4
+       add     14,14,5
+       add     14,14,3
+
+       rotrwi  3,15,2
+       rotrwi  4,15,13
+       and     5,15,8
+       and     0,15,9
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,8,9
+       xor     3,3,4
+       add     10,10,14
+       xor     5,5,0
+       lwz     0,40(7)
+       add     14,14,3
+       add     14,14,5
+
+       lwz     26,40(31)
+       rotrwi  3,10,6
+       rotrwi  4,10,11
+       and     5,11,10
+       xor     3,3,4
+       add     6,6,0
+       andc    0,12,10
+       rotrwi  4,4,14
+       or      5,5,0
+       add     6,6,26
+       xor     3,3,4
+       add     6,6,5
+       add     6,6,3
+
+       rotrwi  3,14,2
+       rotrwi  4,14,13
+       and     5,14,15
+       and     0,14,8
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,15,8
+       xor     3,3,4
+       add     9,9,6
+       xor     5,5,0
+       lwz     0,44(7)
+       add     6,6,3
+       add     6,6,5
+
+       lwz     27,44(31)
+       rotrwi  3,9,6
+       rotrwi  4,9,11
+       and     5,10,9
+       xor     3,3,4
+       add     12,12,0
+       andc    0,11,9
+       rotrwi  4,4,14
+       or      5,5,0
+       add     12,12,27
+       xor     3,3,4
+       add     12,12,5
+       add     12,12,3
+
+       rotrwi  3,6,2
+       rotrwi  4,6,13
+       and     5,6,14
+       and     0,6,15
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,14,15
+       xor     3,3,4
+       add     8,8,12
+       xor     5,5,0
+       lwz     0,48(7)
+       add     12,12,3
+       add     12,12,5
+
+       lwz     28,48(31)
+       rotrwi  3,8,6
+       rotrwi  4,8,11
+       and     5,9,8
+       xor     3,3,4
+       add     11,11,0
+       andc    0,10,8
+       rotrwi  4,4,14
+       or      5,5,0
+       add     11,11,28
+       xor     3,3,4
+       add     11,11,5
+       add     11,11,3
+
+       rotrwi  3,12,2
+       rotrwi  4,12,13
+       and     5,12,6
+       and     0,12,14
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,6,14
+       xor     3,3,4
+       add     15,15,11
+       xor     5,5,0
+       lwz     0,52(7)
+       add     11,11,3
+       add     11,11,5
+
+       lwz     29,52(31)
+       rotrwi  3,15,6
+       rotrwi  4,15,11
+       and     5,8,15
+       xor     3,3,4
+       add     10,10,0
+       andc    0,9,15
+       rotrwi  4,4,14
+       or      5,5,0
+       add     10,10,29
+       xor     3,3,4
+       add     10,10,5
+       add     10,10,3
+
+       rotrwi  3,11,2
+       rotrwi  4,11,13
+       and     5,11,12
+       and     0,11,6
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,12,6
+       xor     3,3,4
+       add     14,14,10
+       xor     5,5,0
+       lwz     0,56(7)
+       add     10,10,3
+       add     10,10,5
+
+       lwz     30,56(31)
+       rotrwi  3,14,6
+       rotrwi  4,14,11
+       and     5,15,14
+       xor     3,3,4
+       add     9,9,0
+       andc    0,8,14
+       rotrwi  4,4,14
+       or      5,5,0
+       add     9,9,30
+       xor     3,3,4
+       add     9,9,5
+       add     9,9,3
+
+       rotrwi  3,10,2
+       rotrwi  4,10,13
+       and     5,10,11
+       and     0,10,12
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,11,12
+       xor     3,3,4
+       add     6,6,9
+       xor     5,5,0
+       lwz     0,60(7)
+       add     9,9,3
+       add     9,9,5
+
+       lwz     31,60(31)
+       rotrwi  3,6,6
+       rotrwi  4,6,11
+       and     5,14,6
+       xor     3,3,4
+       add     8,8,0
+       andc    0,15,6
+       rotrwi  4,4,14
+       or      5,5,0
+       add     8,8,31
+       xor     3,3,4
+       add     8,8,5
+       add     8,8,3
+
+       rotrwi  3,9,2
+       rotrwi  4,9,13
+       and     5,9,10
+       and     0,9,11
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,10,11
+       xor     3,3,4
+       add     12,12,8
+       xor     5,5,0
+       add     8,8,3
+       add     8,8,5
+
+       li      5,3
+       mtctr   5
+.align 4
+.Lrounds:
+       addi    7,7,64
+       rotrwi  3,17,7
+       rotrwi  4,17,18
+       rotrwi  5,30,17
+       rotrwi  0,30,19
+       xor     3,3,4
+       srwi    4,17,3
+       xor     5,5,0
+       srwi    0,30,10
+       add     16,16,25
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,0(7)
+       add     16,16,3
+       add     16,16,5
+       rotrwi  3,12,6
+       rotrwi  4,12,11
+       and     5,6,12
+       xor     3,3,4
+       add     15,15,0
+       andc    0,14,12
+       rotrwi  4,4,14
+       or      5,5,0
+       add     15,15,16
+       xor     3,3,4
+       add     15,15,5
+       add     15,15,3
+
+       rotrwi  3,8,2
+       rotrwi  4,8,13
+       and     5,8,9
+       and     0,8,10
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,9,10
+       xor     3,3,4
+       add     11,11,15
+       xor     5,5,0
+       add     15,15,3
+       add     15,15,5
+
+       rotrwi  3,18,7
+       rotrwi  4,18,18
+       rotrwi  5,31,17
+       rotrwi  0,31,19
+       xor     3,3,4
+       srwi    4,18,3
+       xor     5,5,0
+       srwi    0,31,10
+       add     17,17,26
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,4(7)
+       add     17,17,3
+       add     17,17,5
+       rotrwi  3,11,6
+       rotrwi  4,11,11
+       and     5,12,11
+       xor     3,3,4
+       add     14,14,0
+       andc    0,6,11
+       rotrwi  4,4,14
+       or      5,5,0
+       add     14,14,17
+       xor     3,3,4
+       add     14,14,5
+       add     14,14,3
+
+       rotrwi  3,15,2
+       rotrwi  4,15,13
+       and     5,15,8
+       and     0,15,9
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,8,9
+       xor     3,3,4
+       add     10,10,14
+       xor     5,5,0
+       add     14,14,3
+       add     14,14,5
+
+       rotrwi  3,19,7
+       rotrwi  4,19,18
+       rotrwi  5,16,17
+       rotrwi  0,16,19
+       xor     3,3,4
+       srwi    4,19,3
+       xor     5,5,0
+       srwi    0,16,10
+       add     18,18,27
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,8(7)
+       add     18,18,3
+       add     18,18,5
+       rotrwi  3,10,6
+       rotrwi  4,10,11
+       and     5,11,10
+       xor     3,3,4
+       add     6,6,0
+       andc    0,12,10
+       rotrwi  4,4,14
+       or      5,5,0
+       add     6,6,18
+       xor     3,3,4
+       add     6,6,5
+       add     6,6,3
+
+       rotrwi  3,14,2
+       rotrwi  4,14,13
+       and     5,14,15
+       and     0,14,8
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,15,8
+       xor     3,3,4
+       add     9,9,6
+       xor     5,5,0
+       add     6,6,3
+       add     6,6,5
+
+       rotrwi  3,20,7
+       rotrwi  4,20,18
+       rotrwi  5,17,17
+       rotrwi  0,17,19
+       xor     3,3,4
+       srwi    4,20,3
+       xor     5,5,0
+       srwi    0,17,10
+       add     19,19,28
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,12(7)
+       add     19,19,3
+       add     19,19,5
+       rotrwi  3,9,6
+       rotrwi  4,9,11
+       and     5,10,9
+       xor     3,3,4
+       add     12,12,0
+       andc    0,11,9
+       rotrwi  4,4,14
+       or      5,5,0
+       add     12,12,19
+       xor     3,3,4
+       add     12,12,5
+       add     12,12,3
+
+       rotrwi  3,6,2
+       rotrwi  4,6,13
+       and     5,6,14
+       and     0,6,15
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,14,15
+       xor     3,3,4
+       add     8,8,12
+       xor     5,5,0
+       add     12,12,3
+       add     12,12,5
+
+       rotrwi  3,21,7
+       rotrwi  4,21,18
+       rotrwi  5,18,17
+       rotrwi  0,18,19
+       xor     3,3,4
+       srwi    4,21,3
+       xor     5,5,0
+       srwi    0,18,10
+       add     20,20,29
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,16(7)
+       add     20,20,3
+       add     20,20,5
+       rotrwi  3,8,6
+       rotrwi  4,8,11
+       and     5,9,8
+       xor     3,3,4
+       add     11,11,0
+       andc    0,10,8
+       rotrwi  4,4,14
+       or      5,5,0
+       add     11,11,20
+       xor     3,3,4
+       add     11,11,5
+       add     11,11,3
+
+       rotrwi  3,12,2
+       rotrwi  4,12,13
+       and     5,12,6
+       and     0,12,14
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,6,14
+       xor     3,3,4
+       add     15,15,11
+       xor     5,5,0
+       add     11,11,3
+       add     11,11,5
+
+       rotrwi  3,22,7
+       rotrwi  4,22,18
+       rotrwi  5,19,17
+       rotrwi  0,19,19
+       xor     3,3,4
+       srwi    4,22,3
+       xor     5,5,0
+       srwi    0,19,10
+       add     21,21,30
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,20(7)
+       add     21,21,3
+       add     21,21,5
+       rotrwi  3,15,6
+       rotrwi  4,15,11
+       and     5,8,15
+       xor     3,3,4
+       add     10,10,0
+       andc    0,9,15
+       rotrwi  4,4,14
+       or      5,5,0
+       add     10,10,21
+       xor     3,3,4
+       add     10,10,5
+       add     10,10,3
+
+       rotrwi  3,11,2
+       rotrwi  4,11,13
+       and     5,11,12
+       and     0,11,6
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,12,6
+       xor     3,3,4
+       add     14,14,10
+       xor     5,5,0
+       add     10,10,3
+       add     10,10,5
+
+       rotrwi  3,23,7
+       rotrwi  4,23,18
+       rotrwi  5,20,17
+       rotrwi  0,20,19
+       xor     3,3,4
+       srwi    4,23,3
+       xor     5,5,0
+       srwi    0,20,10
+       add     22,22,31
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,24(7)
+       add     22,22,3
+       add     22,22,5
+       rotrwi  3,14,6
+       rotrwi  4,14,11
+       and     5,15,14
+       xor     3,3,4
+       add     9,9,0
+       andc    0,8,14
+       rotrwi  4,4,14
+       or      5,5,0
+       add     9,9,22
+       xor     3,3,4
+       add     9,9,5
+       add     9,9,3
+
+       rotrwi  3,10,2
+       rotrwi  4,10,13
+       and     5,10,11
+       and     0,10,12
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,11,12
+       xor     3,3,4
+       add     6,6,9
+       xor     5,5,0
+       add     9,9,3
+       add     9,9,5
+
+       rotrwi  3,24,7
+       rotrwi  4,24,18
+       rotrwi  5,21,17
+       rotrwi  0,21,19
+       xor     3,3,4
+       srwi    4,24,3
+       xor     5,5,0
+       srwi    0,21,10
+       add     23,23,16
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,28(7)
+       add     23,23,3
+       add     23,23,5
+       rotrwi  3,6,6
+       rotrwi  4,6,11
+       and     5,14,6
+       xor     3,3,4
+       add     8,8,0
+       andc    0,15,6
+       rotrwi  4,4,14
+       or      5,5,0
+       add     8,8,23
+       xor     3,3,4
+       add     8,8,5
+       add     8,8,3
+
+       rotrwi  3,9,2
+       rotrwi  4,9,13
+       and     5,9,10
+       and     0,9,11
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,10,11
+       xor     3,3,4
+       add     12,12,8
+       xor     5,5,0
+       add     8,8,3
+       add     8,8,5
+
+       rotrwi  3,25,7
+       rotrwi  4,25,18
+       rotrwi  5,22,17
+       rotrwi  0,22,19
+       xor     3,3,4
+       srwi    4,25,3
+       xor     5,5,0
+       srwi    0,22,10
+       add     24,24,17
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,32(7)
+       add     24,24,3
+       add     24,24,5
+       rotrwi  3,12,6
+       rotrwi  4,12,11
+       and     5,6,12
+       xor     3,3,4
+       add     15,15,0
+       andc    0,14,12
+       rotrwi  4,4,14
+       or      5,5,0
+       add     15,15,24
+       xor     3,3,4
+       add     15,15,5
+       add     15,15,3
+
+       rotrwi  3,8,2
+       rotrwi  4,8,13
+       and     5,8,9
+       and     0,8,10
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,9,10
+       xor     3,3,4
+       add     11,11,15
+       xor     5,5,0
+       add     15,15,3
+       add     15,15,5
+
+       rotrwi  3,26,7
+       rotrwi  4,26,18
+       rotrwi  5,23,17
+       rotrwi  0,23,19
+       xor     3,3,4
+       srwi    4,26,3
+       xor     5,5,0
+       srwi    0,23,10
+       add     25,25,18
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,36(7)
+       add     25,25,3
+       add     25,25,5
+       rotrwi  3,11,6
+       rotrwi  4,11,11
+       and     5,12,11
+       xor     3,3,4
+       add     14,14,0
+       andc    0,6,11
+       rotrwi  4,4,14
+       or      5,5,0
+       add     14,14,25
+       xor     3,3,4
+       add     14,14,5
+       add     14,14,3
+
+       rotrwi  3,15,2
+       rotrwi  4,15,13
+       and     5,15,8
+       and     0,15,9
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,8,9
+       xor     3,3,4
+       add     10,10,14
+       xor     5,5,0
+       add     14,14,3
+       add     14,14,5
+
+       rotrwi  3,27,7
+       rotrwi  4,27,18
+       rotrwi  5,24,17
+       rotrwi  0,24,19
+       xor     3,3,4
+       srwi    4,27,3
+       xor     5,5,0
+       srwi    0,24,10
+       add     26,26,19
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,40(7)
+       add     26,26,3
+       add     26,26,5
+       rotrwi  3,10,6
+       rotrwi  4,10,11
+       and     5,11,10
+       xor     3,3,4
+       add     6,6,0
+       andc    0,12,10
+       rotrwi  4,4,14
+       or      5,5,0
+       add     6,6,26
+       xor     3,3,4
+       add     6,6,5
+       add     6,6,3
+
+       rotrwi  3,14,2
+       rotrwi  4,14,13
+       and     5,14,15
+       and     0,14,8
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,15,8
+       xor     3,3,4
+       add     9,9,6
+       xor     5,5,0
+       add     6,6,3
+       add     6,6,5
+
+       rotrwi  3,28,7
+       rotrwi  4,28,18
+       rotrwi  5,25,17
+       rotrwi  0,25,19
+       xor     3,3,4
+       srwi    4,28,3
+       xor     5,5,0
+       srwi    0,25,10
+       add     27,27,20
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,44(7)
+       add     27,27,3
+       add     27,27,5
+       rotrwi  3,9,6
+       rotrwi  4,9,11
+       and     5,10,9
+       xor     3,3,4
+       add     12,12,0
+       andc    0,11,9
+       rotrwi  4,4,14
+       or      5,5,0
+       add     12,12,27
+       xor     3,3,4
+       add     12,12,5
+       add     12,12,3
+
+       rotrwi  3,6,2
+       rotrwi  4,6,13
+       and     5,6,14
+       and     0,6,15
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,14,15
+       xor     3,3,4
+       add     8,8,12
+       xor     5,5,0
+       add     12,12,3
+       add     12,12,5
+
+       rotrwi  3,29,7
+       rotrwi  4,29,18
+       rotrwi  5,26,17
+       rotrwi  0,26,19
+       xor     3,3,4
+       srwi    4,29,3
+       xor     5,5,0
+       srwi    0,26,10
+       add     28,28,21
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,48(7)
+       add     28,28,3
+       add     28,28,5
+       rotrwi  3,8,6
+       rotrwi  4,8,11
+       and     5,9,8
+       xor     3,3,4
+       add     11,11,0
+       andc    0,10,8
+       rotrwi  4,4,14
+       or      5,5,0
+       add     11,11,28
+       xor     3,3,4
+       add     11,11,5
+       add     11,11,3
+
+       rotrwi  3,12,2
+       rotrwi  4,12,13
+       and     5,12,6
+       and     0,12,14
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,6,14
+       xor     3,3,4
+       add     15,15,11
+       xor     5,5,0
+       add     11,11,3
+       add     11,11,5
+
+       rotrwi  3,30,7
+       rotrwi  4,30,18
+       rotrwi  5,27,17
+       rotrwi  0,27,19
+       xor     3,3,4
+       srwi    4,30,3
+       xor     5,5,0
+       srwi    0,27,10
+       add     29,29,22
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,52(7)
+       add     29,29,3
+       add     29,29,5
+       rotrwi  3,15,6
+       rotrwi  4,15,11
+       and     5,8,15
+       xor     3,3,4
+       add     10,10,0
+       andc    0,9,15
+       rotrwi  4,4,14
+       or      5,5,0
+       add     10,10,29
+       xor     3,3,4
+       add     10,10,5
+       add     10,10,3
+
+       rotrwi  3,11,2
+       rotrwi  4,11,13
+       and     5,11,12
+       and     0,11,6
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,12,6
+       xor     3,3,4
+       add     14,14,10
+       xor     5,5,0
+       add     10,10,3
+       add     10,10,5
+
+       rotrwi  3,31,7
+       rotrwi  4,31,18
+       rotrwi  5,28,17
+       rotrwi  0,28,19
+       xor     3,3,4
+       srwi    4,31,3
+       xor     5,5,0
+       srwi    0,28,10
+       add     30,30,23
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,56(7)
+       add     30,30,3
+       add     30,30,5
+       rotrwi  3,14,6
+       rotrwi  4,14,11
+       and     5,15,14
+       xor     3,3,4
+       add     9,9,0
+       andc    0,8,14
+       rotrwi  4,4,14
+       or      5,5,0
+       add     9,9,30
+       xor     3,3,4
+       add     9,9,5
+       add     9,9,3
+
+       rotrwi  3,10,2
+       rotrwi  4,10,13
+       and     5,10,11
+       and     0,10,12
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,11,12
+       xor     3,3,4
+       add     6,6,9
+       xor     5,5,0
+       add     9,9,3
+       add     9,9,5
+
+       rotrwi  3,16,7
+       rotrwi  4,16,18
+       rotrwi  5,29,17
+       rotrwi  0,29,19
+       xor     3,3,4
+       srwi    4,16,3
+       xor     5,5,0
+       srwi    0,29,10
+       add     31,31,24
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,60(7)
+       add     31,31,3
+       add     31,31,5
+       rotrwi  3,6,6
+       rotrwi  4,6,11
+       and     5,14,6
+       xor     3,3,4
+       add     8,8,0
+       andc    0,15,6
+       rotrwi  4,4,14
+       or      5,5,0
+       add     8,8,31
+       xor     3,3,4
+       add     8,8,5
+       add     8,8,3
+
+       rotrwi  3,9,2
+       rotrwi  4,9,13
+       and     5,9,10
+       and     0,9,11
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,10,11
+       xor     3,3,4
+       add     12,12,8
+       xor     5,5,0
+       add     8,8,3
+       add     8,8,5
+
+       bdnz    .Lrounds
+
+       ld      3,144(1)
+       ld      31,136(1)
+       ld      5,128(1)
+       subi    7,7,192
+
+       lwz     16,0(3)
+       lwz     17,4(3)
+       lwz     18,8(3)
+       lwz     19,12(3)
+       lwz     20,16(3)
+       lwz     21,20(3)
+       lwz     22,24(3)
+       addi    31,31,64
+       lwz     23,28(3)
+       add     8,8,16
+       add     9,9,17
+       std     31,136(1)
+       add     10,10,18
+       stw     8,0(3)
+       add     11,11,19
+       stw     9,4(3)
+       add     12,12,20
+       stw     10,8(3)
+       add     6,6,21
+       stw     11,12(3)
+       add     14,14,22
+       stw     12,16(3)
+       add     15,15,23
+       stw     6,20(3)
+       stw     14,24(3)
+       cmpld   31,5
+       stw     15,28(3)
+       bne     .Lsha2_block_private
+       blr     
+.long  0
+.byte  0,12,0x14,0,0,0,0,0
+.size  .zfs_sha256_ppc,.-.zfs_sha256_ppc
+.size  zfs_sha256_ppc,.-.zfs_sha256_ppc
+.align 6
+.LPICmeup:
+       mflr    0
+       bcl     20,31,$+4
+       mflr    7
+       addi    7,7,56
+       mtlr    0
+       blr     
+.long  0
+.byte  0,12,0x14,0,0,0,0,0
+.space 28
+.long  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+#elif (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+
+.abiversion    2
+.text
+
+.globl zfs_sha256_ppc
+.type  zfs_sha256_ppc,@function
+.align 6
+zfs_sha256_ppc:
+.localentry    zfs_sha256_ppc,0
+
+       stdu    1,-320(1)
+       mflr    0
+       sldi    5,5,6
+
+       std     3,144(1)
+
+       std     14,176(1)
+       std     15,184(1)
+       std     16,192(1)
+       std     17,200(1)
+       std     18,208(1)
+       std     19,216(1)
+       std     20,224(1)
+       std     21,232(1)
+       std     22,240(1)
+       std     23,248(1)
+       std     24,256(1)
+       std     25,264(1)
+       std     26,272(1)
+       std     27,280(1)
+       std     28,288(1)
+       std     29,296(1)
+       std     30,304(1)
+       std     31,312(1)
+       std     0,336(1)
+       lwz     8,0(3)
+       mr      31,4
+       lwz     9,4(3)
+       lwz     10,8(3)
+       lwz     11,12(3)
+       lwz     12,16(3)
+       lwz     6,20(3)
+       lwz     14,24(3)
+       lwz     15,28(3)
+       bl      .LPICmeup
+.LPICedup:
+       andi.   0,31,3
+       bne     .Lunaligned
+.Laligned:
+       add     5,31,5
+       std     5,128(1)
+       std     31,136(1)
+       bl      .Lsha2_block_private
+       b       .Ldone
+
+.align 4
+.Lunaligned:
+       subfic  0,31,4096
+       andi.   0,0,4032
+       beq     .Lcross_page
+       cmpld   5,0
+       ble     .Laligned
+       subfc   5,0,5
+       add     0,31,0
+       std     5,120(1)
+       std     0,128(1)
+       std     31,136(1)
+       bl      .Lsha2_block_private
+
+       ld      5,120(1)
+.Lcross_page:
+       li      0,16
+       mtctr   0
+       addi    20,1,48
+.Lmemcpy:
+       lbz     16,0(31)
+       lbz     17,1(31)
+       lbz     18,2(31)
+       lbz     19,3(31)
+       addi    31,31,4
+       stb     16,0(20)
+       stb     17,1(20)
+       stb     18,2(20)
+       stb     19,3(20)
+       addi    20,20,4
+       bdnz    .Lmemcpy
+       std     31,112(1)
+       addi    0,1,112
+       addi    31,1,48
+       std     5,120(1)
+       std     0,128(1)
+       std     31,136(1)
+       bl      .Lsha2_block_private
+       ld      31,112(1)
+       ld      5,120(1)
+       addic.  5,5,-64
+       bne     .Lunaligned
+
+.Ldone:
+       ld      0,336(1)
+       ld      14,176(1)
+       ld      15,184(1)
+       ld      16,192(1)
+       ld      17,200(1)
+       ld      18,208(1)
+       ld      19,216(1)
+       ld      20,224(1)
+       ld      21,232(1)
+       ld      22,240(1)
+       ld      23,248(1)
+       ld      24,256(1)
+       ld      25,264(1)
+       ld      26,272(1)
+       ld      27,280(1)
+       ld      28,288(1)
+       ld      29,296(1)
+       ld      30,304(1)
+       ld      31,312(1)
+       mtlr    0
+       addi    1,1,320
+       blr     
+.long  0
+.byte  0,12,4,1,0x80,18,3,0
+.long  0
+.align 4
+.Lsha2_block_private:
+       lwz     0,0(7)
+       lwz     3,0(31)
+       rotlwi  16,3,8
+       rlwimi  16,3,24,0,7
+       rlwimi  16,3,24,16,23
+       rotrwi  3,12,6
+       rotrwi  4,12,11
+       and     5,6,12
+       xor     3,3,4
+       add     15,15,0
+       andc    0,14,12
+       rotrwi  4,4,14
+       or      5,5,0
+       add     15,15,16
+       xor     3,3,4
+       add     15,15,5
+       add     15,15,3
+
+       rotrwi  3,8,2
+       rotrwi  4,8,13
+       and     5,8,9
+       and     0,8,10
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,9,10
+       xor     3,3,4
+       add     11,11,15
+       xor     5,5,0
+       lwz     0,4(7)
+       add     15,15,3
+       add     15,15,5
+
+       lwz     3,4(31)
+       rotlwi  17,3,8
+       rlwimi  17,3,24,0,7
+       rlwimi  17,3,24,16,23
+       rotrwi  3,11,6
+       rotrwi  4,11,11
+       and     5,12,11
+       xor     3,3,4
+       add     14,14,0
+       andc    0,6,11
+       rotrwi  4,4,14
+       or      5,5,0
+       add     14,14,17
+       xor     3,3,4
+       add     14,14,5
+       add     14,14,3
+
+       rotrwi  3,15,2
+       rotrwi  4,15,13
+       and     5,15,8
+       and     0,15,9
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,8,9
+       xor     3,3,4
+       add     10,10,14
+       xor     5,5,0
+       lwz     0,8(7)
+       add     14,14,3
+       add     14,14,5
+
+       lwz     3,8(31)
+       rotlwi  18,3,8
+       rlwimi  18,3,24,0,7
+       rlwimi  18,3,24,16,23
+       rotrwi  3,10,6
+       rotrwi  4,10,11
+       and     5,11,10
+       xor     3,3,4
+       add     6,6,0
+       andc    0,12,10
+       rotrwi  4,4,14
+       or      5,5,0
+       add     6,6,18
+       xor     3,3,4
+       add     6,6,5
+       add     6,6,3
+
+       rotrwi  3,14,2
+       rotrwi  4,14,13
+       and     5,14,15
+       and     0,14,8
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,15,8
+       xor     3,3,4
+       add     9,9,6
+       xor     5,5,0
+       lwz     0,12(7)
+       add     6,6,3
+       add     6,6,5
+
+       lwz     3,12(31)
+       rotlwi  19,3,8
+       rlwimi  19,3,24,0,7
+       rlwimi  19,3,24,16,23
+       rotrwi  3,9,6
+       rotrwi  4,9,11
+       and     5,10,9
+       xor     3,3,4
+       add     12,12,0
+       andc    0,11,9
+       rotrwi  4,4,14
+       or      5,5,0
+       add     12,12,19
+       xor     3,3,4
+       add     12,12,5
+       add     12,12,3
+
+       rotrwi  3,6,2
+       rotrwi  4,6,13
+       and     5,6,14
+       and     0,6,15
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,14,15
+       xor     3,3,4
+       add     8,8,12
+       xor     5,5,0
+       lwz     0,16(7)
+       add     12,12,3
+       add     12,12,5
+
+       lwz     3,16(31)
+       rotlwi  20,3,8
+       rlwimi  20,3,24,0,7
+       rlwimi  20,3,24,16,23
+       rotrwi  3,8,6
+       rotrwi  4,8,11
+       and     5,9,8
+       xor     3,3,4
+       add     11,11,0
+       andc    0,10,8
+       rotrwi  4,4,14
+       or      5,5,0
+       add     11,11,20
+       xor     3,3,4
+       add     11,11,5
+       add     11,11,3
+
+       rotrwi  3,12,2
+       rotrwi  4,12,13
+       and     5,12,6
+       and     0,12,14
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,6,14
+       xor     3,3,4
+       add     15,15,11
+       xor     5,5,0
+       lwz     0,20(7)
+       add     11,11,3
+       add     11,11,5
+
+       lwz     3,20(31)
+       rotlwi  21,3,8
+       rlwimi  21,3,24,0,7
+       rlwimi  21,3,24,16,23
+       rotrwi  3,15,6
+       rotrwi  4,15,11
+       and     5,8,15
+       xor     3,3,4
+       add     10,10,0
+       andc    0,9,15
+       rotrwi  4,4,14
+       or      5,5,0
+       add     10,10,21
+       xor     3,3,4
+       add     10,10,5
+       add     10,10,3
+
+       rotrwi  3,11,2
+       rotrwi  4,11,13
+       and     5,11,12
+       and     0,11,6
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,12,6
+       xor     3,3,4
+       add     14,14,10
+       xor     5,5,0
+       lwz     0,24(7)
+       add     10,10,3
+       add     10,10,5
+
+       lwz     3,24(31)
+       rotlwi  22,3,8
+       rlwimi  22,3,24,0,7
+       rlwimi  22,3,24,16,23
+       rotrwi  3,14,6
+       rotrwi  4,14,11
+       and     5,15,14
+       xor     3,3,4
+       add     9,9,0
+       andc    0,8,14
+       rotrwi  4,4,14
+       or      5,5,0
+       add     9,9,22
+       xor     3,3,4
+       add     9,9,5
+       add     9,9,3
+
+       rotrwi  3,10,2
+       rotrwi  4,10,13
+       and     5,10,11
+       and     0,10,12
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,11,12
+       xor     3,3,4
+       add     6,6,9
+       xor     5,5,0
+       lwz     0,28(7)
+       add     9,9,3
+       add     9,9,5
+
+       lwz     3,28(31)
+       rotlwi  23,3,8
+       rlwimi  23,3,24,0,7
+       rlwimi  23,3,24,16,23
+       rotrwi  3,6,6
+       rotrwi  4,6,11
+       and     5,14,6
+       xor     3,3,4
+       add     8,8,0
+       andc    0,15,6
+       rotrwi  4,4,14
+       or      5,5,0
+       add     8,8,23
+       xor     3,3,4
+       add     8,8,5
+       add     8,8,3
+
+       rotrwi  3,9,2
+       rotrwi  4,9,13
+       and     5,9,10
+       and     0,9,11
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,10,11
+       xor     3,3,4
+       add     12,12,8
+       xor     5,5,0
+       lwz     0,32(7)
+       add     8,8,3
+       add     8,8,5
+
+       lwz     3,32(31)
+       rotlwi  24,3,8
+       rlwimi  24,3,24,0,7
+       rlwimi  24,3,24,16,23
+       rotrwi  3,12,6
+       rotrwi  4,12,11
+       and     5,6,12
+       xor     3,3,4
+       add     15,15,0
+       andc    0,14,12
+       rotrwi  4,4,14
+       or      5,5,0
+       add     15,15,24
+       xor     3,3,4
+       add     15,15,5
+       add     15,15,3
+
+       rotrwi  3,8,2
+       rotrwi  4,8,13
+       and     5,8,9
+       and     0,8,10
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,9,10
+       xor     3,3,4
+       add     11,11,15
+       xor     5,5,0
+       lwz     0,36(7)
+       add     15,15,3
+       add     15,15,5
+
+       lwz     3,36(31)
+       rotlwi  25,3,8
+       rlwimi  25,3,24,0,7
+       rlwimi  25,3,24,16,23
+       rotrwi  3,11,6
+       rotrwi  4,11,11
+       and     5,12,11
+       xor     3,3,4
+       add     14,14,0
+       andc    0,6,11
+       rotrwi  4,4,14
+       or      5,5,0
+       add     14,14,25
+       xor     3,3,4
+       add     14,14,5
+       add     14,14,3
+
+       rotrwi  3,15,2
+       rotrwi  4,15,13
+       and     5,15,8
+       and     0,15,9
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,8,9
+       xor     3,3,4
+       add     10,10,14
+       xor     5,5,0
+       lwz     0,40(7)
+       add     14,14,3
+       add     14,14,5
+
+       lwz     3,40(31)
+       rotlwi  26,3,8
+       rlwimi  26,3,24,0,7
+       rlwimi  26,3,24,16,23
+       rotrwi  3,10,6
+       rotrwi  4,10,11
+       and     5,11,10
+       xor     3,3,4
+       add     6,6,0
+       andc    0,12,10
+       rotrwi  4,4,14
+       or      5,5,0
+       add     6,6,26
+       xor     3,3,4
+       add     6,6,5
+       add     6,6,3
+
+       rotrwi  3,14,2
+       rotrwi  4,14,13
+       and     5,14,15
+       and     0,14,8
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,15,8
+       xor     3,3,4
+       add     9,9,6
+       xor     5,5,0
+       lwz     0,44(7)
+       add     6,6,3
+       add     6,6,5
+
+       lwz     3,44(31)
+       rotlwi  27,3,8
+       rlwimi  27,3,24,0,7
+       rlwimi  27,3,24,16,23
+       rotrwi  3,9,6
+       rotrwi  4,9,11
+       and     5,10,9
+       xor     3,3,4
+       add     12,12,0
+       andc    0,11,9
+       rotrwi  4,4,14
+       or      5,5,0
+       add     12,12,27
+       xor     3,3,4
+       add     12,12,5
+       add     12,12,3
+
+       rotrwi  3,6,2
+       rotrwi  4,6,13
+       and     5,6,14
+       and     0,6,15
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,14,15
+       xor     3,3,4
+       add     8,8,12
+       xor     5,5,0
+       lwz     0,48(7)
+       add     12,12,3
+       add     12,12,5
+
+       lwz     3,48(31)
+       rotlwi  28,3,8
+       rlwimi  28,3,24,0,7
+       rlwimi  28,3,24,16,23
+       rotrwi  3,8,6
+       rotrwi  4,8,11
+       and     5,9,8
+       xor     3,3,4
+       add     11,11,0
+       andc    0,10,8
+       rotrwi  4,4,14
+       or      5,5,0
+       add     11,11,28
+       xor     3,3,4
+       add     11,11,5
+       add     11,11,3
+
+       rotrwi  3,12,2
+       rotrwi  4,12,13
+       and     5,12,6
+       and     0,12,14
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,6,14
+       xor     3,3,4
+       add     15,15,11
+       xor     5,5,0
+       lwz     0,52(7)
+       add     11,11,3
+       add     11,11,5
+
+       lwz     3,52(31)
+       rotlwi  29,3,8
+       rlwimi  29,3,24,0,7
+       rlwimi  29,3,24,16,23
+       rotrwi  3,15,6
+       rotrwi  4,15,11
+       and     5,8,15
+       xor     3,3,4
+       add     10,10,0
+       andc    0,9,15
+       rotrwi  4,4,14
+       or      5,5,0
+       add     10,10,29
+       xor     3,3,4
+       add     10,10,5
+       add     10,10,3
+
+       rotrwi  3,11,2
+       rotrwi  4,11,13
+       and     5,11,12
+       and     0,11,6
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,12,6
+       xor     3,3,4
+       add     14,14,10
+       xor     5,5,0
+       lwz     0,56(7)
+       add     10,10,3
+       add     10,10,5
+
+       lwz     3,56(31)
+       rotlwi  30,3,8
+       rlwimi  30,3,24,0,7
+       rlwimi  30,3,24,16,23
+       rotrwi  3,14,6
+       rotrwi  4,14,11
+       and     5,15,14
+       xor     3,3,4
+       add     9,9,0
+       andc    0,8,14
+       rotrwi  4,4,14
+       or      5,5,0
+       add     9,9,30
+       xor     3,3,4
+       add     9,9,5
+       add     9,9,3
+
+       rotrwi  3,10,2
+       rotrwi  4,10,13
+       and     5,10,11
+       and     0,10,12
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,11,12
+       xor     3,3,4
+       add     6,6,9
+       xor     5,5,0
+       lwz     0,60(7)
+       add     9,9,3
+       add     9,9,5
+
+       lwz     3,60(31)
+       rotlwi  31,3,8
+       rlwimi  31,3,24,0,7
+       rlwimi  31,3,24,16,23
+       rotrwi  3,6,6
+       rotrwi  4,6,11
+       and     5,14,6
+       xor     3,3,4
+       add     8,8,0
+       andc    0,15,6
+       rotrwi  4,4,14
+       or      5,5,0
+       add     8,8,31
+       xor     3,3,4
+       add     8,8,5
+       add     8,8,3
+
+       rotrwi  3,9,2
+       rotrwi  4,9,13
+       and     5,9,10
+       and     0,9,11
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,10,11
+       xor     3,3,4
+       add     12,12,8
+       xor     5,5,0
+       add     8,8,3
+       add     8,8,5
+
+       li      5,3
+       mtctr   5
+.align 4
+.Lrounds:
+       addi    7,7,64
+       rotrwi  3,17,7
+       rotrwi  4,17,18
+       rotrwi  5,30,17
+       rotrwi  0,30,19
+       xor     3,3,4
+       srwi    4,17,3
+       xor     5,5,0
+       srwi    0,30,10
+       add     16,16,25
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,0(7)
+       add     16,16,3
+       add     16,16,5
+       rotrwi  3,12,6
+       rotrwi  4,12,11
+       and     5,6,12
+       xor     3,3,4
+       add     15,15,0
+       andc    0,14,12
+       rotrwi  4,4,14
+       or      5,5,0
+       add     15,15,16
+       xor     3,3,4
+       add     15,15,5
+       add     15,15,3
+
+       rotrwi  3,8,2
+       rotrwi  4,8,13
+       and     5,8,9
+       and     0,8,10
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,9,10
+       xor     3,3,4
+       add     11,11,15
+       xor     5,5,0
+       add     15,15,3
+       add     15,15,5
+
+       rotrwi  3,18,7
+       rotrwi  4,18,18
+       rotrwi  5,31,17
+       rotrwi  0,31,19
+       xor     3,3,4
+       srwi    4,18,3
+       xor     5,5,0
+       srwi    0,31,10
+       add     17,17,26
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,4(7)
+       add     17,17,3
+       add     17,17,5
+       rotrwi  3,11,6
+       rotrwi  4,11,11
+       and     5,12,11
+       xor     3,3,4
+       add     14,14,0
+       andc    0,6,11
+       rotrwi  4,4,14
+       or      5,5,0
+       add     14,14,17
+       xor     3,3,4
+       add     14,14,5
+       add     14,14,3
+
+       rotrwi  3,15,2
+       rotrwi  4,15,13
+       and     5,15,8
+       and     0,15,9
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,8,9
+       xor     3,3,4
+       add     10,10,14
+       xor     5,5,0
+       add     14,14,3
+       add     14,14,5
+
+       rotrwi  3,19,7
+       rotrwi  4,19,18
+       rotrwi  5,16,17
+       rotrwi  0,16,19
+       xor     3,3,4
+       srwi    4,19,3
+       xor     5,5,0
+       srwi    0,16,10
+       add     18,18,27
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,8(7)
+       add     18,18,3
+       add     18,18,5
+       rotrwi  3,10,6
+       rotrwi  4,10,11
+       and     5,11,10
+       xor     3,3,4
+       add     6,6,0
+       andc    0,12,10
+       rotrwi  4,4,14
+       or      5,5,0
+       add     6,6,18
+       xor     3,3,4
+       add     6,6,5
+       add     6,6,3
+
+       rotrwi  3,14,2
+       rotrwi  4,14,13
+       and     5,14,15
+       and     0,14,8
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,15,8
+       xor     3,3,4
+       add     9,9,6
+       xor     5,5,0
+       add     6,6,3
+       add     6,6,5
+
+       rotrwi  3,20,7
+       rotrwi  4,20,18
+       rotrwi  5,17,17
+       rotrwi  0,17,19
+       xor     3,3,4
+       srwi    4,20,3
+       xor     5,5,0
+       srwi    0,17,10
+       add     19,19,28
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,12(7)
+       add     19,19,3
+       add     19,19,5
+       rotrwi  3,9,6
+       rotrwi  4,9,11
+       and     5,10,9
+       xor     3,3,4
+       add     12,12,0
+       andc    0,11,9
+       rotrwi  4,4,14
+       or      5,5,0
+       add     12,12,19
+       xor     3,3,4
+       add     12,12,5
+       add     12,12,3
+
+       rotrwi  3,6,2
+       rotrwi  4,6,13
+       and     5,6,14
+       and     0,6,15
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,14,15
+       xor     3,3,4
+       add     8,8,12
+       xor     5,5,0
+       add     12,12,3
+       add     12,12,5
+
+       rotrwi  3,21,7
+       rotrwi  4,21,18
+       rotrwi  5,18,17
+       rotrwi  0,18,19
+       xor     3,3,4
+       srwi    4,21,3
+       xor     5,5,0
+       srwi    0,18,10
+       add     20,20,29
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,16(7)
+       add     20,20,3
+       add     20,20,5
+       rotrwi  3,8,6
+       rotrwi  4,8,11
+       and     5,9,8
+       xor     3,3,4
+       add     11,11,0
+       andc    0,10,8
+       rotrwi  4,4,14
+       or      5,5,0
+       add     11,11,20
+       xor     3,3,4
+       add     11,11,5
+       add     11,11,3
+
+       rotrwi  3,12,2
+       rotrwi  4,12,13
+       and     5,12,6
+       and     0,12,14
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,6,14
+       xor     3,3,4
+       add     15,15,11
+       xor     5,5,0
+       add     11,11,3
+       add     11,11,5
+
+       rotrwi  3,22,7
+       rotrwi  4,22,18
+       rotrwi  5,19,17
+       rotrwi  0,19,19
+       xor     3,3,4
+       srwi    4,22,3
+       xor     5,5,0
+       srwi    0,19,10
+       add     21,21,30
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,20(7)
+       add     21,21,3
+       add     21,21,5
+       rotrwi  3,15,6
+       rotrwi  4,15,11
+       and     5,8,15
+       xor     3,3,4
+       add     10,10,0
+       andc    0,9,15
+       rotrwi  4,4,14
+       or      5,5,0
+       add     10,10,21
+       xor     3,3,4
+       add     10,10,5
+       add     10,10,3
+
+       rotrwi  3,11,2
+       rotrwi  4,11,13
+       and     5,11,12
+       and     0,11,6
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,12,6
+       xor     3,3,4
+       add     14,14,10
+       xor     5,5,0
+       add     10,10,3
+       add     10,10,5
+
+       rotrwi  3,23,7
+       rotrwi  4,23,18
+       rotrwi  5,20,17
+       rotrwi  0,20,19
+       xor     3,3,4
+       srwi    4,23,3
+       xor     5,5,0
+       srwi    0,20,10
+       add     22,22,31
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,24(7)
+       add     22,22,3
+       add     22,22,5
+       rotrwi  3,14,6
+       rotrwi  4,14,11
+       and     5,15,14
+       xor     3,3,4
+       add     9,9,0
+       andc    0,8,14
+       rotrwi  4,4,14
+       or      5,5,0
+       add     9,9,22
+       xor     3,3,4
+       add     9,9,5
+       add     9,9,3
+
+       rotrwi  3,10,2
+       rotrwi  4,10,13
+       and     5,10,11
+       and     0,10,12
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,11,12
+       xor     3,3,4
+       add     6,6,9
+       xor     5,5,0
+       add     9,9,3
+       add     9,9,5
+
+       rotrwi  3,24,7
+       rotrwi  4,24,18
+       rotrwi  5,21,17
+       rotrwi  0,21,19
+       xor     3,3,4
+       srwi    4,24,3
+       xor     5,5,0
+       srwi    0,21,10
+       add     23,23,16
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,28(7)
+       add     23,23,3
+       add     23,23,5
+       rotrwi  3,6,6
+       rotrwi  4,6,11
+       and     5,14,6
+       xor     3,3,4
+       add     8,8,0
+       andc    0,15,6
+       rotrwi  4,4,14
+       or      5,5,0
+       add     8,8,23
+       xor     3,3,4
+       add     8,8,5
+       add     8,8,3
+
+       rotrwi  3,9,2
+       rotrwi  4,9,13
+       and     5,9,10
+       and     0,9,11
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,10,11
+       xor     3,3,4
+       add     12,12,8
+       xor     5,5,0
+       add     8,8,3
+       add     8,8,5
+
+       rotrwi  3,25,7
+       rotrwi  4,25,18
+       rotrwi  5,22,17
+       rotrwi  0,22,19
+       xor     3,3,4
+       srwi    4,25,3
+       xor     5,5,0
+       srwi    0,22,10
+       add     24,24,17
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,32(7)
+       add     24,24,3
+       add     24,24,5
+       rotrwi  3,12,6
+       rotrwi  4,12,11
+       and     5,6,12
+       xor     3,3,4
+       add     15,15,0
+       andc    0,14,12
+       rotrwi  4,4,14
+       or      5,5,0
+       add     15,15,24
+       xor     3,3,4
+       add     15,15,5
+       add     15,15,3
+
+       rotrwi  3,8,2
+       rotrwi  4,8,13
+       and     5,8,9
+       and     0,8,10
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,9,10
+       xor     3,3,4
+       add     11,11,15
+       xor     5,5,0
+       add     15,15,3
+       add     15,15,5
+
+       rotrwi  3,26,7
+       rotrwi  4,26,18
+       rotrwi  5,23,17
+       rotrwi  0,23,19
+       xor     3,3,4
+       srwi    4,26,3
+       xor     5,5,0
+       srwi    0,23,10
+       add     25,25,18
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,36(7)
+       add     25,25,3
+       add     25,25,5
+       rotrwi  3,11,6
+       rotrwi  4,11,11
+       and     5,12,11
+       xor     3,3,4
+       add     14,14,0
+       andc    0,6,11
+       rotrwi  4,4,14
+       or      5,5,0
+       add     14,14,25
+       xor     3,3,4
+       add     14,14,5
+       add     14,14,3
+
+       rotrwi  3,15,2
+       rotrwi  4,15,13
+       and     5,15,8
+       and     0,15,9
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,8,9
+       xor     3,3,4
+       add     10,10,14
+       xor     5,5,0
+       add     14,14,3
+       add     14,14,5
+
+       rotrwi  3,27,7
+       rotrwi  4,27,18
+       rotrwi  5,24,17
+       rotrwi  0,24,19
+       xor     3,3,4
+       srwi    4,27,3
+       xor     5,5,0
+       srwi    0,24,10
+       add     26,26,19
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,40(7)
+       add     26,26,3
+       add     26,26,5
+       rotrwi  3,10,6
+       rotrwi  4,10,11
+       and     5,11,10
+       xor     3,3,4
+       add     6,6,0
+       andc    0,12,10
+       rotrwi  4,4,14
+       or      5,5,0
+       add     6,6,26
+       xor     3,3,4
+       add     6,6,5
+       add     6,6,3
+
+       rotrwi  3,14,2
+       rotrwi  4,14,13
+       and     5,14,15
+       and     0,14,8
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,15,8
+       xor     3,3,4
+       add     9,9,6
+       xor     5,5,0
+       add     6,6,3
+       add     6,6,5
+
+       rotrwi  3,28,7
+       rotrwi  4,28,18
+       rotrwi  5,25,17
+       rotrwi  0,25,19
+       xor     3,3,4
+       srwi    4,28,3
+       xor     5,5,0
+       srwi    0,25,10
+       add     27,27,20
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,44(7)
+       add     27,27,3
+       add     27,27,5
+       rotrwi  3,9,6
+       rotrwi  4,9,11
+       and     5,10,9
+       xor     3,3,4
+       add     12,12,0
+       andc    0,11,9
+       rotrwi  4,4,14
+       or      5,5,0
+       add     12,12,27
+       xor     3,3,4
+       add     12,12,5
+       add     12,12,3
+
+       rotrwi  3,6,2
+       rotrwi  4,6,13
+       and     5,6,14
+       and     0,6,15
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,14,15
+       xor     3,3,4
+       add     8,8,12
+       xor     5,5,0
+       add     12,12,3
+       add     12,12,5
+
+       rotrwi  3,29,7
+       rotrwi  4,29,18
+       rotrwi  5,26,17
+       rotrwi  0,26,19
+       xor     3,3,4
+       srwi    4,29,3
+       xor     5,5,0
+       srwi    0,26,10
+       add     28,28,21
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,48(7)
+       add     28,28,3
+       add     28,28,5
+       rotrwi  3,8,6
+       rotrwi  4,8,11
+       and     5,9,8
+       xor     3,3,4
+       add     11,11,0
+       andc    0,10,8
+       rotrwi  4,4,14
+       or      5,5,0
+       add     11,11,28
+       xor     3,3,4
+       add     11,11,5
+       add     11,11,3
+
+       rotrwi  3,12,2
+       rotrwi  4,12,13
+       and     5,12,6
+       and     0,12,14
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,6,14
+       xor     3,3,4
+       add     15,15,11
+       xor     5,5,0
+       add     11,11,3
+       add     11,11,5
+
+       rotrwi  3,30,7
+       rotrwi  4,30,18
+       rotrwi  5,27,17
+       rotrwi  0,27,19
+       xor     3,3,4
+       srwi    4,30,3
+       xor     5,5,0
+       srwi    0,27,10
+       add     29,29,22
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,52(7)
+       add     29,29,3
+       add     29,29,5
+       rotrwi  3,15,6
+       rotrwi  4,15,11
+       and     5,8,15
+       xor     3,3,4
+       add     10,10,0
+       andc    0,9,15
+       rotrwi  4,4,14
+       or      5,5,0
+       add     10,10,29
+       xor     3,3,4
+       add     10,10,5
+       add     10,10,3
+
+       rotrwi  3,11,2
+       rotrwi  4,11,13
+       and     5,11,12
+       and     0,11,6
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,12,6
+       xor     3,3,4
+       add     14,14,10
+       xor     5,5,0
+       add     10,10,3
+       add     10,10,5
+
+       rotrwi  3,31,7
+       rotrwi  4,31,18
+       rotrwi  5,28,17
+       rotrwi  0,28,19
+       xor     3,3,4
+       srwi    4,31,3
+       xor     5,5,0
+       srwi    0,28,10
+       add     30,30,23
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,56(7)
+       add     30,30,3
+       add     30,30,5
+       rotrwi  3,14,6
+       rotrwi  4,14,11
+       and     5,15,14
+       xor     3,3,4
+       add     9,9,0
+       andc    0,8,14
+       rotrwi  4,4,14
+       or      5,5,0
+       add     9,9,30
+       xor     3,3,4
+       add     9,9,5
+       add     9,9,3
+
+       rotrwi  3,10,2
+       rotrwi  4,10,13
+       and     5,10,11
+       and     0,10,12
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,11,12
+       xor     3,3,4
+       add     6,6,9
+       xor     5,5,0
+       add     9,9,3
+       add     9,9,5
+
+       rotrwi  3,16,7
+       rotrwi  4,16,18
+       rotrwi  5,29,17
+       rotrwi  0,29,19
+       xor     3,3,4
+       srwi    4,16,3
+       xor     5,5,0
+       srwi    0,29,10
+       add     31,31,24
+       xor     3,3,4
+       xor     5,5,0
+       lwz     0,60(7)
+       add     31,31,3
+       add     31,31,5
+       rotrwi  3,6,6
+       rotrwi  4,6,11
+       and     5,14,6
+       xor     3,3,4
+       add     8,8,0
+       andc    0,15,6
+       rotrwi  4,4,14
+       or      5,5,0
+       add     8,8,31
+       xor     3,3,4
+       add     8,8,5
+       add     8,8,3
+
+       rotrwi  3,9,2
+       rotrwi  4,9,13
+       and     5,9,10
+       and     0,9,11
+       xor     3,3,4
+       rotrwi  4,4,9
+       xor     5,5,0
+       and     0,10,11
+       xor     3,3,4
+       add     12,12,8
+       xor     5,5,0
+       add     8,8,3
+       add     8,8,5
+
+       bdnz    .Lrounds
+
+       ld      3,144(1)
+       ld      31,136(1)
+       ld      5,128(1)
+       subi    7,7,192
+
+       lwz     16,0(3)
+       lwz     17,4(3)
+       lwz     18,8(3)
+       lwz     19,12(3)
+       lwz     20,16(3)
+       lwz     21,20(3)
+       lwz     22,24(3)
+       addi    31,31,64
+       lwz     23,28(3)
+       add     8,8,16
+       add     9,9,17
+       std     31,136(1)
+       add     10,10,18
+       stw     8,0(3)
+       add     11,11,19
+       stw     9,4(3)
+       add     12,12,20
+       stw     10,8(3)
+       add     6,6,21
+       stw     11,12(3)
+       add     14,14,22
+       stw     12,16(3)
+       add     15,15,23
+       stw     6,20(3)
+       stw     14,24(3)
+       cmpld   31,5
+       stw     15,28(3)
+       bne     .Lsha2_block_private
+       blr     
+.long  0
+.byte  0,12,0x14,0,0,0,0,0
+.size  zfs_sha256_ppc,.-zfs_sha256_ppc
+.align 6
+.LPICmeup:
+       mflr    0
+       bcl     20,31,$+4
+       mflr    7
+       addi    7,7,56
+       mtlr    0
+       blr     
+.long  0
+.byte  0,12,0x14,0,0,0,0,0
+.space 28
+.long  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+#endif
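
Note on usage: the zfs_sha256_ppc routine that ends here, like the zfs_sha512_power8 routine in the file that follows, is a bare block transform — it consumes whole message blocks and updates the hash state in place, leaving buffering and padding to the C side. The sketch below shows how such an entry point is typically declared and driven from C; the prototype and the helper function are illustrative assumptions, not code taken from this commit.

/*
 * Hypothetical usage sketch (not part of this commit).
 * Assumed prototype: 8-word state, input buffer, count of 64-byte blocks.
 */
#include <stdint.h>
#include <stddef.h>

extern void zfs_sha256_ppc(uint32_t state[8], const void *data,
    size_t blocks);

/*
 * Hand only whole 64-byte blocks to the assembly transform and report how
 * many tail bytes the caller still has to buffer for the next update.
 */
static size_t
sha256_update_blocks(uint32_t state[8], const uint8_t *data, size_t len)
{
	size_t blocks = len / 64;

	if (blocks != 0)
		zfs_sha256_ppc(state, data, blocks);

	return (len - blocks * 64);
}
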
diff --git a/module/icp/asm-ppc64/sha2/sha512-p8.S b/module/icp/asm-ppc64/sha2/sha512-p8.S
new file mode 100644 (file)
index 0000000..39a90ed
--- /dev/null
@@ -0,0 +1,1706 @@
+/*
+ * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Portions Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ * - modified assembly to fit into OpenZFS
+ */
+
+#if (defined(__PPC64__) && defined(__BIG_ENDIAN__))
+
+.text
+
+.globl zfs_sha512_power8
+.globl .zfs_sha512_power8
+.type  zfs_sha512_power8,@function
+.section       ".opd","aw"
+.align 3
+zfs_sha512_power8:
+.quad  .zfs_sha512_power8,.TOC.@tocbase,0
+.previous
+.align 6
+.zfs_sha512_power8:
+       stdu    1,-384(1)
+       mflr    8
+       li      10,207
+       li      11,223
+       stvx    24,10,1
+       addi    10,10,32
+       mfspr   12,256
+       stvx    25,11,1
+       addi    11,11,32
+       stvx    26,10,1
+       addi    10,10,32
+       stvx    27,11,1
+       addi    11,11,32
+       stvx    28,10,1
+       addi    10,10,32
+       stvx    29,11,1
+       addi    11,11,32
+       stvx    30,10,1
+       stvx    31,11,1
+       li      11,-4096+255
+       stw     12,332(1)
+       li      10,0x10
+       std     26,336(1)
+       li      26,0x20
+       std     27,344(1)
+       li      27,0x30
+       std     28,352(1)
+       li      28,0x40
+       std     29,360(1)
+       li      29,0x50
+       std     30,368(1)
+       li      30,0x60
+       std     31,376(1)
+       li      31,0x70
+       std     8,400(1)
+       mtspr   256,11
+
+       bl      .LPICmeup
+       addi    11,1,79
+       .long   0x7C001E99
+       .long   0x7C4A1E99
+       .long   0x7C9A1E99
+       vsldoi  1,0,0,8
+       .long   0x7CDB1E99
+       vsldoi  3,2,2,8
+       vsldoi  5,4,4,8
+       vsldoi  7,6,6,8
+       li      0,4
+       b       .Loop
+.align 5
+.Loop:
+       lvx     28,0,6
+       .long   0x7D002699
+       addi    4,4,16
+       mr      7,6
+       stvx    0,0,11
+       stvx    1,10,11
+       stvx    2,26,11
+       stvx    3,27,11
+       stvx    4,28,11
+       stvx    5,29,11
+       stvx    6,30,11
+       stvx    7,31,11
+       .long   0x10E7E0C0
+       lvx     28,10,6
+       .long   0x10E740C0
+       vsel    29,6,5,4
+       .long   0x10C6E0C0
+       .long   0x10E7E8C0
+       .long   0x13C4FEC2
+       .long   0x10E7F0C0
+       vxor    29,0,1
+       vsel    29,1,2,29
+       .long   0x106338C0
+       .long   0x13C086C2
+       .long   0x13DEE8C0
+       .long   0x10E7F0C0
+       lvx     28,26,7
+       .long   0x7D402699
+       addi    4,4,16
+       vsldoi  9,8,8,8
+       .long   0x10C648C0
+       vsel    29,5,4,3
+       .long   0x10A5E0C0
+       .long   0x10C6E8C0
+       .long   0x13C3FEC2
+       .long   0x10C6F0C0
+       vxor    29,7,0
+       vsel    29,0,1,29
+       .long   0x104230C0
+       .long   0x13C786C2
+       .long   0x13DEE8C0
+       .long   0x10C6F0C0
+       lvx     28,27,7
+       .long   0x10A550C0
+       vsel    29,4,3,2
+       .long   0x1084E0C0
+       .long   0x10A5E8C0
+       .long   0x13C2FEC2
+       .long   0x10A5F0C0
+       vxor    29,6,7
+       vsel    29,7,0,29
+       .long   0x102128C0
+       .long   0x13C686C2
+       .long   0x13DEE8C0
+       .long   0x10A5F0C0
+       lvx     28,28,7
+       .long   0x7D802699
+       addi    4,4,16
+       vsldoi  11,10,10,8
+       .long   0x108458C0
+       vsel    29,3,2,1
+       .long   0x1063E0C0
+       .long   0x1084E8C0
+       .long   0x13C1FEC2
+       .long   0x1084F0C0
+       vxor    29,5,6
+       vsel    29,6,7,29
+       .long   0x100020C0
+       .long   0x13C586C2
+       .long   0x13DEE8C0
+       .long   0x1084F0C0
+       lvx     28,29,7
+       .long   0x106360C0
+       vsel    29,2,1,0
+       .long   0x1042E0C0
+       .long   0x1063E8C0
+       .long   0x13C0FEC2
+       .long   0x1063F0C0
+       vxor    29,4,5
+       vsel    29,5,6,29
+       .long   0x10E718C0
+       .long   0x13C486C2
+       .long   0x13DEE8C0
+       .long   0x1063F0C0
+       lvx     28,30,7
+       .long   0x7DC02699
+       addi    4,4,16
+       vsldoi  13,12,12,8
+       .long   0x104268C0
+       vsel    29,1,0,7
+       .long   0x1021E0C0
+       .long   0x1042E8C0
+       .long   0x13C7FEC2
+       .long   0x1042F0C0
+       vxor    29,3,4
+       vsel    29,4,5,29
+       .long   0x10C610C0
+       .long   0x13C386C2
+       .long   0x13DEE8C0
+       .long   0x1042F0C0
+       lvx     28,31,7
+       addi    7,7,0x80
+       .long   0x102170C0
+       vsel    29,0,7,6
+       .long   0x1000E0C0
+       .long   0x1021E8C0
+       .long   0x13C6FEC2
+       .long   0x1021F0C0
+       vxor    29,2,3
+       vsel    29,3,4,29
+       .long   0x10A508C0
+       .long   0x13C286C2
+       .long   0x13DEE8C0
+       .long   0x1021F0C0
+       lvx     28,0,7
+       .long   0x7E002699
+       addi    4,4,16
+       vsldoi  15,14,14,8
+       .long   0x100078C0
+       vsel    29,7,6,5
+       .long   0x10E7E0C0
+       .long   0x1000E8C0
+       .long   0x13C5FEC2
+       .long   0x1000F0C0
+       vxor    29,1,2
+       vsel    29,2,3,29
+       .long   0x108400C0
+       .long   0x13C186C2
+       .long   0x13DEE8C0
+       .long   0x1000F0C0
+       lvx     28,10,7
+       .long   0x10E780C0
+       vsel    29,6,5,4
+       .long   0x10C6E0C0
+       .long   0x10E7E8C0
+       .long   0x13C4FEC2
+       .long   0x10E7F0C0
+       vxor    29,0,1
+       vsel    29,1,2,29
+       .long   0x106338C0
+       .long   0x13C086C2
+       .long   0x13DEE8C0
+       .long   0x10E7F0C0
+       lvx     28,26,7
+       .long   0x7E402699
+       addi    4,4,16
+       vsldoi  17,16,16,8
+       .long   0x10C688C0
+       vsel    29,5,4,3
+       .long   0x10A5E0C0
+       .long   0x10C6E8C0
+       .long   0x13C3FEC2
+       .long   0x10C6F0C0
+       vxor    29,7,0
+       vsel    29,0,1,29
+       .long   0x104230C0
+       .long   0x13C786C2
+       .long   0x13DEE8C0
+       .long   0x10C6F0C0
+       lvx     28,27,7
+       .long   0x10A590C0
+       vsel    29,4,3,2
+       .long   0x1084E0C0
+       .long   0x10A5E8C0
+       .long   0x13C2FEC2
+       .long   0x10A5F0C0
+       vxor    29,6,7
+       vsel    29,7,0,29
+       .long   0x102128C0
+       .long   0x13C686C2
+       .long   0x13DEE8C0
+       .long   0x10A5F0C0
+       lvx     28,28,7
+       .long   0x7F002699
+       addi    4,4,16
+       vsldoi  19,18,18,8
+       .long   0x108498C0
+       vsel    29,3,2,1
+       .long   0x1063E0C0
+       .long   0x1084E8C0
+       .long   0x13C1FEC2
+       .long   0x1084F0C0
+       vxor    29,5,6
+       vsel    29,6,7,29
+       .long   0x100020C0
+       .long   0x13C586C2
+       .long   0x13DEE8C0
+       .long   0x1084F0C0
+       lvx     28,29,7
+       .long   0x1063C0C0
+       vsel    29,2,1,0
+       .long   0x1042E0C0
+       .long   0x1063E8C0
+       .long   0x13C0FEC2
+       .long   0x1063F0C0
+       vxor    29,4,5
+       vsel    29,5,6,29
+       .long   0x10E718C0
+       .long   0x13C486C2
+       .long   0x13DEE8C0
+       .long   0x1063F0C0
+       lvx     28,30,7
+       .long   0x7F402699
+       addi    4,4,16
+       vsldoi  25,24,24,8
+       .long   0x1042C8C0
+       vsel    29,1,0,7
+       .long   0x1021E0C0
+       .long   0x1042E8C0
+       .long   0x13C7FEC2
+       .long   0x1042F0C0
+       vxor    29,3,4
+       vsel    29,4,5,29
+       .long   0x10C610C0
+       .long   0x13C386C2
+       .long   0x13DEE8C0
+       .long   0x1042F0C0
+       lvx     28,31,7
+       addi    7,7,0x80
+       .long   0x1021D0C0
+       vsel    29,0,7,6
+       .long   0x1000E0C0
+       .long   0x1021E8C0
+       .long   0x13C6FEC2
+       .long   0x1021F0C0
+       vxor    29,2,3
+       vsel    29,3,4,29
+       .long   0x10A508C0
+       .long   0x13C286C2
+       .long   0x13DEE8C0
+       .long   0x1021F0C0
+       lvx     28,0,7
+       vsldoi  27,26,26,8
+       .long   0x13C906C2
+       .long   0x1108F0C0
+       .long   0x13DA7EC2
+       .long   0x1108F0C0
+       .long   0x110888C0
+       .long   0x1000D8C0
+       vsel    29,7,6,5
+       .long   0x10E7E0C0
+       .long   0x1000E8C0
+       .long   0x13C5FEC2
+       .long   0x1000F0C0
+       vxor    29,1,2
+       vsel    29,2,3,29
+       .long   0x108400C0
+       .long   0x13C186C2
+       .long   0x13DEE8C0
+       .long   0x1000F0C0
+       lvx     28,10,7
+       mtctr   0
+       b       .L16_xx
+.align 5
+.L16_xx:
+       .long   0x13CA06C2
+       .long   0x1129F0C0
+       .long   0x13DB7EC2
+       .long   0x1129F0C0
+       .long   0x112990C0
+       .long   0x10E740C0
+       vsel    29,6,5,4
+       .long   0x10C6E0C0
+       .long   0x10E7E8C0
+       .long   0x13C4FEC2
+       .long   0x10E7F0C0
+       vxor    29,0,1
+       vsel    29,1,2,29
+       .long   0x106338C0
+       .long   0x13C086C2
+       .long   0x13DEE8C0
+       .long   0x10E7F0C0
+       lvx     28,26,7
+       .long   0x13CB06C2
+       .long   0x114AF0C0
+       .long   0x13C87EC2
+       .long   0x114AF0C0
+       .long   0x114A98C0
+       .long   0x10C648C0
+       vsel    29,5,4,3
+       .long   0x10A5E0C0
+       .long   0x10C6E8C0
+       .long   0x13C3FEC2
+       .long   0x10C6F0C0
+       vxor    29,7,0
+       vsel    29,0,1,29
+       .long   0x104230C0
+       .long   0x13C786C2
+       .long   0x13DEE8C0
+       .long   0x10C6F0C0
+       lvx     28,27,7
+       .long   0x13CC06C2
+       .long   0x116BF0C0
+       .long   0x13C97EC2
+       .long   0x116BF0C0
+       .long   0x116BC0C0
+       .long   0x10A550C0
+       vsel    29,4,3,2
+       .long   0x1084E0C0
+       .long   0x10A5E8C0
+       .long   0x13C2FEC2
+       .long   0x10A5F0C0
+       vxor    29,6,7
+       vsel    29,7,0,29
+       .long   0x102128C0
+       .long   0x13C686C2
+       .long   0x13DEE8C0
+       .long   0x10A5F0C0
+       lvx     28,28,7
+       .long   0x13CD06C2
+       .long   0x118CF0C0
+       .long   0x13CA7EC2
+       .long   0x118CF0C0
+       .long   0x118CC8C0
+       .long   0x108458C0
+       vsel    29,3,2,1
+       .long   0x1063E0C0
+       .long   0x1084E8C0
+       .long   0x13C1FEC2
+       .long   0x1084F0C0
+       vxor    29,5,6
+       vsel    29,6,7,29
+       .long   0x100020C0
+       .long   0x13C586C2
+       .long   0x13DEE8C0
+       .long   0x1084F0C0
+       lvx     28,29,7
+       .long   0x13CE06C2
+       .long   0x11ADF0C0
+       .long   0x13CB7EC2
+       .long   0x11ADF0C0
+       .long   0x11ADD0C0
+       .long   0x106360C0
+       vsel    29,2,1,0
+       .long   0x1042E0C0
+       .long   0x1063E8C0
+       .long   0x13C0FEC2
+       .long   0x1063F0C0
+       vxor    29,4,5
+       vsel    29,5,6,29
+       .long   0x10E718C0
+       .long   0x13C486C2
+       .long   0x13DEE8C0
+       .long   0x1063F0C0
+       lvx     28,30,7
+       .long   0x13CF06C2
+       .long   0x11CEF0C0
+       .long   0x13CC7EC2
+       .long   0x11CEF0C0
+       .long   0x11CED8C0
+       .long   0x104268C0
+       vsel    29,1,0,7
+       .long   0x1021E0C0
+       .long   0x1042E8C0
+       .long   0x13C7FEC2
+       .long   0x1042F0C0
+       vxor    29,3,4
+       vsel    29,4,5,29
+       .long   0x10C610C0
+       .long   0x13C386C2
+       .long   0x13DEE8C0
+       .long   0x1042F0C0
+       lvx     28,31,7
+       addi    7,7,0x80
+       .long   0x13D006C2
+       .long   0x11EFF0C0
+       .long   0x13CD7EC2
+       .long   0x11EFF0C0
+       .long   0x11EF40C0
+       .long   0x102170C0
+       vsel    29,0,7,6
+       .long   0x1000E0C0
+       .long   0x1021E8C0
+       .long   0x13C6FEC2
+       .long   0x1021F0C0
+       vxor    29,2,3
+       vsel    29,3,4,29
+       .long   0x10A508C0
+       .long   0x13C286C2
+       .long   0x13DEE8C0
+       .long   0x1021F0C0
+       lvx     28,0,7
+       .long   0x13D106C2
+       .long   0x1210F0C0
+       .long   0x13CE7EC2
+       .long   0x1210F0C0
+       .long   0x121048C0
+       .long   0x100078C0
+       vsel    29,7,6,5
+       .long   0x10E7E0C0
+       .long   0x1000E8C0
+       .long   0x13C5FEC2
+       .long   0x1000F0C0
+       vxor    29,1,2
+       vsel    29,2,3,29
+       .long   0x108400C0
+       .long   0x13C186C2
+       .long   0x13DEE8C0
+       .long   0x1000F0C0
+       lvx     28,10,7
+       .long   0x13D206C2
+       .long   0x1231F0C0
+       .long   0x13CF7EC2
+       .long   0x1231F0C0
+       .long   0x123150C0
+       .long   0x10E780C0
+       vsel    29,6,5,4
+       .long   0x10C6E0C0
+       .long   0x10E7E8C0
+       .long   0x13C4FEC2
+       .long   0x10E7F0C0
+       vxor    29,0,1
+       vsel    29,1,2,29
+       .long   0x106338C0
+       .long   0x13C086C2
+       .long   0x13DEE8C0
+       .long   0x10E7F0C0
+       lvx     28,26,7
+       .long   0x13D306C2
+       .long   0x1252F0C0
+       .long   0x13D07EC2
+       .long   0x1252F0C0
+       .long   0x125258C0
+       .long   0x10C688C0
+       vsel    29,5,4,3
+       .long   0x10A5E0C0
+       .long   0x10C6E8C0
+       .long   0x13C3FEC2
+       .long   0x10C6F0C0
+       vxor    29,7,0
+       vsel    29,0,1,29
+       .long   0x104230C0
+       .long   0x13C786C2
+       .long   0x13DEE8C0
+       .long   0x10C6F0C0
+       lvx     28,27,7
+       .long   0x13D806C2
+       .long   0x1273F0C0
+       .long   0x13D17EC2
+       .long   0x1273F0C0
+       .long   0x127360C0
+       .long   0x10A590C0
+       vsel    29,4,3,2
+       .long   0x1084E0C0
+       .long   0x10A5E8C0
+       .long   0x13C2FEC2
+       .long   0x10A5F0C0
+       vxor    29,6,7
+       vsel    29,7,0,29
+       .long   0x102128C0
+       .long   0x13C686C2
+       .long   0x13DEE8C0
+       .long   0x10A5F0C0
+       lvx     28,28,7
+       .long   0x13D906C2
+       .long   0x1318F0C0
+       .long   0x13D27EC2
+       .long   0x1318F0C0
+       .long   0x131868C0
+       .long   0x108498C0
+       vsel    29,3,2,1
+       .long   0x1063E0C0
+       .long   0x1084E8C0
+       .long   0x13C1FEC2
+       .long   0x1084F0C0
+       vxor    29,5,6
+       vsel    29,6,7,29
+       .long   0x100020C0
+       .long   0x13C586C2
+       .long   0x13DEE8C0
+       .long   0x1084F0C0
+       lvx     28,29,7
+       .long   0x13DA06C2
+       .long   0x1339F0C0
+       .long   0x13D37EC2
+       .long   0x1339F0C0
+       .long   0x133970C0
+       .long   0x1063C0C0
+       vsel    29,2,1,0
+       .long   0x1042E0C0
+       .long   0x1063E8C0
+       .long   0x13C0FEC2
+       .long   0x1063F0C0
+       vxor    29,4,5
+       vsel    29,5,6,29
+       .long   0x10E718C0
+       .long   0x13C486C2
+       .long   0x13DEE8C0
+       .long   0x1063F0C0
+       lvx     28,30,7
+       .long   0x13DB06C2
+       .long   0x135AF0C0
+       .long   0x13D87EC2
+       .long   0x135AF0C0
+       .long   0x135A78C0
+       .long   0x1042C8C0
+       vsel    29,1,0,7
+       .long   0x1021E0C0
+       .long   0x1042E8C0
+       .long   0x13C7FEC2
+       .long   0x1042F0C0
+       vxor    29,3,4
+       vsel    29,4,5,29
+       .long   0x10C610C0
+       .long   0x13C386C2
+       .long   0x13DEE8C0
+       .long   0x1042F0C0
+       lvx     28,31,7
+       addi    7,7,0x80
+       .long   0x13C806C2
+       .long   0x137BF0C0
+       .long   0x13D97EC2
+       .long   0x137BF0C0
+       .long   0x137B80C0
+       .long   0x1021D0C0
+       vsel    29,0,7,6
+       .long   0x1000E0C0
+       .long   0x1021E8C0
+       .long   0x13C6FEC2
+       .long   0x1021F0C0
+       vxor    29,2,3
+       vsel    29,3,4,29
+       .long   0x10A508C0
+       .long   0x13C286C2
+       .long   0x13DEE8C0
+       .long   0x1021F0C0
+       lvx     28,0,7
+       .long   0x13C906C2
+       .long   0x1108F0C0
+       .long   0x13DA7EC2
+       .long   0x1108F0C0
+       .long   0x110888C0
+       .long   0x1000D8C0
+       vsel    29,7,6,5
+       .long   0x10E7E0C0
+       .long   0x1000E8C0
+       .long   0x13C5FEC2
+       .long   0x1000F0C0
+       vxor    29,1,2
+       vsel    29,2,3,29
+       .long   0x108400C0
+       .long   0x13C186C2
+       .long   0x13DEE8C0
+       .long   0x1000F0C0
+       lvx     28,10,7
+       bdnz    .L16_xx
+
+       lvx     10,0,11
+       subic.  5,5,1
+       lvx     11,10,11
+       .long   0x100050C0
+       lvx     12,26,11
+       .long   0x102158C0
+       lvx     13,27,11
+       .long   0x104260C0
+       lvx     14,28,11
+       .long   0x106368C0
+       lvx     15,29,11
+       .long   0x108470C0
+       lvx     16,30,11
+       .long   0x10A578C0
+       lvx     17,31,11
+       .long   0x10C680C0
+       .long   0x10E788C0
+       bne     .Loop
+       vperm   0,0,1,28
+       vperm   2,2,3,28
+       vperm   4,4,5,28
+       vperm   6,6,7,28
+       .long   0x7C001F99
+       .long   0x7C4A1F99
+       .long   0x7C9A1F99
+       .long   0x7CDB1F99
+       addi    11,1,207
+       mtlr    8
+       mtspr   256,12
+       lvx     24,0,11
+       lvx     25,10,11
+       lvx     26,26,11
+       lvx     27,27,11
+       lvx     28,28,11
+       lvx     29,29,11
+       lvx     30,30,11
+       lvx     31,31,11
+       ld      26,336(1)
+       ld      27,344(1)
+       ld      28,352(1)
+       ld      29,360(1)
+       ld      30,368(1)
+       ld      31,376(1)
+       addi    1,1,384
+       blr     
+.long  0
+.byte  0,12,4,1,0x80,6,3,0
+.long  0
+.size  .zfs_sha512_power8,.-.zfs_sha512_power8
+.size  zfs_sha512_power8,.-.zfs_sha512_power8
+.align 6
+.LPICmeup:
+       mflr    0
+       bcl     20,31,$+4
+       mflr    6
+       addi    6,6,56
+       mtlr    0
+       blr     
+.long  0
+.byte  0,12,0x14,0,0,0,0,0
+.space 28
+.long  0x428a2f98,0xd728ae22
+.long  0x428a2f98,0xd728ae22
+.long  0x71374491,0x23ef65cd
+.long  0x71374491,0x23ef65cd
+.long  0xb5c0fbcf,0xec4d3b2f
+.long  0xb5c0fbcf,0xec4d3b2f
+.long  0xe9b5dba5,0x8189dbbc
+.long  0xe9b5dba5,0x8189dbbc
+.long  0x3956c25b,0xf348b538
+.long  0x3956c25b,0xf348b538
+.long  0x59f111f1,0xb605d019
+.long  0x59f111f1,0xb605d019
+.long  0x923f82a4,0xaf194f9b
+.long  0x923f82a4,0xaf194f9b
+.long  0xab1c5ed5,0xda6d8118
+.long  0xab1c5ed5,0xda6d8118
+.long  0xd807aa98,0xa3030242
+.long  0xd807aa98,0xa3030242
+.long  0x12835b01,0x45706fbe
+.long  0x12835b01,0x45706fbe
+.long  0x243185be,0x4ee4b28c
+.long  0x243185be,0x4ee4b28c
+.long  0x550c7dc3,0xd5ffb4e2
+.long  0x550c7dc3,0xd5ffb4e2
+.long  0x72be5d74,0xf27b896f
+.long  0x72be5d74,0xf27b896f
+.long  0x80deb1fe,0x3b1696b1
+.long  0x80deb1fe,0x3b1696b1
+.long  0x9bdc06a7,0x25c71235
+.long  0x9bdc06a7,0x25c71235
+.long  0xc19bf174,0xcf692694
+.long  0xc19bf174,0xcf692694
+.long  0xe49b69c1,0x9ef14ad2
+.long  0xe49b69c1,0x9ef14ad2
+.long  0xefbe4786,0x384f25e3
+.long  0xefbe4786,0x384f25e3
+.long  0x0fc19dc6,0x8b8cd5b5
+.long  0x0fc19dc6,0x8b8cd5b5
+.long  0x240ca1cc,0x77ac9c65
+.long  0x240ca1cc,0x77ac9c65
+.long  0x2de92c6f,0x592b0275
+.long  0x2de92c6f,0x592b0275
+.long  0x4a7484aa,0x6ea6e483
+.long  0x4a7484aa,0x6ea6e483
+.long  0x5cb0a9dc,0xbd41fbd4
+.long  0x5cb0a9dc,0xbd41fbd4
+.long  0x76f988da,0x831153b5
+.long  0x76f988da,0x831153b5
+.long  0x983e5152,0xee66dfab
+.long  0x983e5152,0xee66dfab
+.long  0xa831c66d,0x2db43210
+.long  0xa831c66d,0x2db43210
+.long  0xb00327c8,0x98fb213f
+.long  0xb00327c8,0x98fb213f
+.long  0xbf597fc7,0xbeef0ee4
+.long  0xbf597fc7,0xbeef0ee4
+.long  0xc6e00bf3,0x3da88fc2
+.long  0xc6e00bf3,0x3da88fc2
+.long  0xd5a79147,0x930aa725
+.long  0xd5a79147,0x930aa725
+.long  0x06ca6351,0xe003826f
+.long  0x06ca6351,0xe003826f
+.long  0x14292967,0x0a0e6e70
+.long  0x14292967,0x0a0e6e70
+.long  0x27b70a85,0x46d22ffc
+.long  0x27b70a85,0x46d22ffc
+.long  0x2e1b2138,0x5c26c926
+.long  0x2e1b2138,0x5c26c926
+.long  0x4d2c6dfc,0x5ac42aed
+.long  0x4d2c6dfc,0x5ac42aed
+.long  0x53380d13,0x9d95b3df
+.long  0x53380d13,0x9d95b3df
+.long  0x650a7354,0x8baf63de
+.long  0x650a7354,0x8baf63de
+.long  0x766a0abb,0x3c77b2a8
+.long  0x766a0abb,0x3c77b2a8
+.long  0x81c2c92e,0x47edaee6
+.long  0x81c2c92e,0x47edaee6
+.long  0x92722c85,0x1482353b
+.long  0x92722c85,0x1482353b
+.long  0xa2bfe8a1,0x4cf10364
+.long  0xa2bfe8a1,0x4cf10364
+.long  0xa81a664b,0xbc423001
+.long  0xa81a664b,0xbc423001
+.long  0xc24b8b70,0xd0f89791
+.long  0xc24b8b70,0xd0f89791
+.long  0xc76c51a3,0x0654be30
+.long  0xc76c51a3,0x0654be30
+.long  0xd192e819,0xd6ef5218
+.long  0xd192e819,0xd6ef5218
+.long  0xd6990624,0x5565a910
+.long  0xd6990624,0x5565a910
+.long  0xf40e3585,0x5771202a
+.long  0xf40e3585,0x5771202a
+.long  0x106aa070,0x32bbd1b8
+.long  0x106aa070,0x32bbd1b8
+.long  0x19a4c116,0xb8d2d0c8
+.long  0x19a4c116,0xb8d2d0c8
+.long  0x1e376c08,0x5141ab53
+.long  0x1e376c08,0x5141ab53
+.long  0x2748774c,0xdf8eeb99
+.long  0x2748774c,0xdf8eeb99
+.long  0x34b0bcb5,0xe19b48a8
+.long  0x34b0bcb5,0xe19b48a8
+.long  0x391c0cb3,0xc5c95a63
+.long  0x391c0cb3,0xc5c95a63
+.long  0x4ed8aa4a,0xe3418acb
+.long  0x4ed8aa4a,0xe3418acb
+.long  0x5b9cca4f,0x7763e373
+.long  0x5b9cca4f,0x7763e373
+.long  0x682e6ff3,0xd6b2b8a3
+.long  0x682e6ff3,0xd6b2b8a3
+.long  0x748f82ee,0x5defb2fc
+.long  0x748f82ee,0x5defb2fc
+.long  0x78a5636f,0x43172f60
+.long  0x78a5636f,0x43172f60
+.long  0x84c87814,0xa1f0ab72
+.long  0x84c87814,0xa1f0ab72
+.long  0x8cc70208,0x1a6439ec
+.long  0x8cc70208,0x1a6439ec
+.long  0x90befffa,0x23631e28
+.long  0x90befffa,0x23631e28
+.long  0xa4506ceb,0xde82bde9
+.long  0xa4506ceb,0xde82bde9
+.long  0xbef9a3f7,0xb2c67915
+.long  0xbef9a3f7,0xb2c67915
+.long  0xc67178f2,0xe372532b
+.long  0xc67178f2,0xe372532b
+.long  0xca273ece,0xea26619c
+.long  0xca273ece,0xea26619c
+.long  0xd186b8c7,0x21c0c207
+.long  0xd186b8c7,0x21c0c207
+.long  0xeada7dd6,0xcde0eb1e
+.long  0xeada7dd6,0xcde0eb1e
+.long  0xf57d4f7f,0xee6ed178
+.long  0xf57d4f7f,0xee6ed178
+.long  0x06f067aa,0x72176fba
+.long  0x06f067aa,0x72176fba
+.long  0x0a637dc5,0xa2c898a6
+.long  0x0a637dc5,0xa2c898a6
+.long  0x113f9804,0xbef90dae
+.long  0x113f9804,0xbef90dae
+.long  0x1b710b35,0x131c471b
+.long  0x1b710b35,0x131c471b
+.long  0x28db77f5,0x23047d84
+.long  0x28db77f5,0x23047d84
+.long  0x32caab7b,0x40c72493
+.long  0x32caab7b,0x40c72493
+.long  0x3c9ebe0a,0x15c9bebc
+.long  0x3c9ebe0a,0x15c9bebc
+.long  0x431d67c4,0x9c100d4c
+.long  0x431d67c4,0x9c100d4c
+.long  0x4cc5d4be,0xcb3e42b6
+.long  0x4cc5d4be,0xcb3e42b6
+.long  0x597f299c,0xfc657e2a
+.long  0x597f299c,0xfc657e2a
+.long  0x5fcb6fab,0x3ad6faec
+.long  0x5fcb6fab,0x3ad6faec
+.long  0x6c44198c,0x4a475817
+.long  0x6c44198c,0x4a475817
+.long  0,0
+.long  0,0
+.long  0x00010203,0x04050607
+.long  0x10111213,0x14151617
+
+#elif (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+
+.abiversion    2
+.text
+
+.globl zfs_sha512_power8
+.type  zfs_sha512_power8,@function
+.align 6
+zfs_sha512_power8:
+.localentry    zfs_sha512_power8,0
+
+       stdu    1,-384(1)
+       mflr    8
+       li      10,207
+       li      11,223
+       stvx    24,10,1
+       addi    10,10,32
+       li      12,-1
+       stvx    25,11,1
+       addi    11,11,32
+       stvx    26,10,1
+       addi    10,10,32
+       stvx    27,11,1
+       addi    11,11,32
+       stvx    28,10,1
+       addi    10,10,32
+       stvx    29,11,1
+       addi    11,11,32
+       stvx    30,10,1
+       stvx    31,11,1
+       li      11,-4096+255
+       stw     12,332(1)
+       li      10,0x10
+       std     26,336(1)
+       li      26,0x20
+       std     27,344(1)
+       li      27,0x30
+       std     28,352(1)
+       li      28,0x40
+       std     29,360(1)
+       li      29,0x50
+       std     30,368(1)
+       li      30,0x60
+       std     31,376(1)
+       li      31,0x70
+       std     8,400(1)
+       or      11,11,11
+
+       bl      .LPICmeup
+       addi    11,1,79
+       li      7,8
+       lvsl    31,0,7
+       vspltisb        28,0x0f
+       vxor    31,31,28
+       .long   0x7C001E99
+       .long   0x7C4A1E99
+       .long   0x7C9A1E99
+       vsldoi  1,0,0,8
+       .long   0x7CDB1E99
+       vsldoi  3,2,2,8
+       vsldoi  5,4,4,8
+       vsldoi  7,6,6,8
+       li      0,4
+       b       .Loop
+.align 5
+.Loop:
+       lvx     28,0,6
+       .long   0x7D002699
+       addi    4,4,16
+       mr      7,6
+       stvx    0,0,11
+       stvx    1,10,11
+       stvx    2,26,11
+       stvx    3,27,11
+       stvx    4,28,11
+       stvx    5,29,11
+       stvx    6,30,11
+       stvx    7,31,11
+       .long   0x10E7E0C0
+       lvx     28,10,6
+       vperm   8,8,8,31
+       .long   0x10E740C0
+       vsel    29,6,5,4
+       .long   0x10C6E0C0
+       .long   0x10E7E8C0
+       .long   0x13C4FEC2
+       .long   0x10E7F0C0
+       vxor    29,0,1
+       vsel    29,1,2,29
+       .long   0x106338C0
+       .long   0x13C086C2
+       .long   0x13DEE8C0
+       .long   0x10E7F0C0
+       lvx     28,26,7
+       .long   0x7D402699
+       addi    4,4,16
+       vsldoi  9,8,8,8
+       .long   0x10C648C0
+       vsel    29,5,4,3
+       .long   0x10A5E0C0
+       .long   0x10C6E8C0
+       .long   0x13C3FEC2
+       .long   0x10C6F0C0
+       vxor    29,7,0
+       vsel    29,0,1,29
+       .long   0x104230C0
+       .long   0x13C786C2
+       .long   0x13DEE8C0
+       .long   0x10C6F0C0
+       lvx     28,27,7
+       vperm   10,10,10,31
+       .long   0x10A550C0
+       vsel    29,4,3,2
+       .long   0x1084E0C0
+       .long   0x10A5E8C0
+       .long   0x13C2FEC2
+       .long   0x10A5F0C0
+       vxor    29,6,7
+       vsel    29,7,0,29
+       .long   0x102128C0
+       .long   0x13C686C2
+       .long   0x13DEE8C0
+       .long   0x10A5F0C0
+       lvx     28,28,7
+       .long   0x7D802699
+       addi    4,4,16
+       vsldoi  11,10,10,8
+       .long   0x108458C0
+       vsel    29,3,2,1
+       .long   0x1063E0C0
+       .long   0x1084E8C0
+       .long   0x13C1FEC2
+       .long   0x1084F0C0
+       vxor    29,5,6
+       vsel    29,6,7,29
+       .long   0x100020C0
+       .long   0x13C586C2
+       .long   0x13DEE8C0
+       .long   0x1084F0C0
+       lvx     28,29,7
+       vperm   12,12,12,31
+       .long   0x106360C0
+       vsel    29,2,1,0
+       .long   0x1042E0C0
+       .long   0x1063E8C0
+       .long   0x13C0FEC2
+       .long   0x1063F0C0
+       vxor    29,4,5
+       vsel    29,5,6,29
+       .long   0x10E718C0
+       .long   0x13C486C2
+       .long   0x13DEE8C0
+       .long   0x1063F0C0
+       lvx     28,30,7
+       .long   0x7DC02699
+       addi    4,4,16
+       vsldoi  13,12,12,8
+       .long   0x104268C0
+       vsel    29,1,0,7
+       .long   0x1021E0C0
+       .long   0x1042E8C0
+       .long   0x13C7FEC2
+       .long   0x1042F0C0
+       vxor    29,3,4
+       vsel    29,4,5,29
+       .long   0x10C610C0
+       .long   0x13C386C2
+       .long   0x13DEE8C0
+       .long   0x1042F0C0
+       lvx     28,31,7
+       addi    7,7,0x80
+       vperm   14,14,14,31
+       .long   0x102170C0
+       vsel    29,0,7,6
+       .long   0x1000E0C0
+       .long   0x1021E8C0
+       .long   0x13C6FEC2
+       .long   0x1021F0C0
+       vxor    29,2,3
+       vsel    29,3,4,29
+       .long   0x10A508C0
+       .long   0x13C286C2
+       .long   0x13DEE8C0
+       .long   0x1021F0C0
+       lvx     28,0,7
+       .long   0x7E002699
+       addi    4,4,16
+       vsldoi  15,14,14,8
+       .long   0x100078C0
+       vsel    29,7,6,5
+       .long   0x10E7E0C0
+       .long   0x1000E8C0
+       .long   0x13C5FEC2
+       .long   0x1000F0C0
+       vxor    29,1,2
+       vsel    29,2,3,29
+       .long   0x108400C0
+       .long   0x13C186C2
+       .long   0x13DEE8C0
+       .long   0x1000F0C0
+       lvx     28,10,7
+       vperm   16,16,16,31
+       .long   0x10E780C0
+       vsel    29,6,5,4
+       .long   0x10C6E0C0
+       .long   0x10E7E8C0
+       .long   0x13C4FEC2
+       .long   0x10E7F0C0
+       vxor    29,0,1
+       vsel    29,1,2,29
+       .long   0x106338C0
+       .long   0x13C086C2
+       .long   0x13DEE8C0
+       .long   0x10E7F0C0
+       lvx     28,26,7
+       .long   0x7E402699
+       addi    4,4,16
+       vsldoi  17,16,16,8
+       .long   0x10C688C0
+       vsel    29,5,4,3
+       .long   0x10A5E0C0
+       .long   0x10C6E8C0
+       .long   0x13C3FEC2
+       .long   0x10C6F0C0
+       vxor    29,7,0
+       vsel    29,0,1,29
+       .long   0x104230C0
+       .long   0x13C786C2
+       .long   0x13DEE8C0
+       .long   0x10C6F0C0
+       lvx     28,27,7
+       vperm   18,18,18,31
+       .long   0x10A590C0
+       vsel    29,4,3,2
+       .long   0x1084E0C0
+       .long   0x10A5E8C0
+       .long   0x13C2FEC2
+       .long   0x10A5F0C0
+       vxor    29,6,7
+       vsel    29,7,0,29
+       .long   0x102128C0
+       .long   0x13C686C2
+       .long   0x13DEE8C0
+       .long   0x10A5F0C0
+       lvx     28,28,7
+       .long   0x7F002699
+       addi    4,4,16
+       vsldoi  19,18,18,8
+       .long   0x108498C0
+       vsel    29,3,2,1
+       .long   0x1063E0C0
+       .long   0x1084E8C0
+       .long   0x13C1FEC2
+       .long   0x1084F0C0
+       vxor    29,5,6
+       vsel    29,6,7,29
+       .long   0x100020C0
+       .long   0x13C586C2
+       .long   0x13DEE8C0
+       .long   0x1084F0C0
+       lvx     28,29,7
+       vperm   24,24,24,31
+       .long   0x1063C0C0
+       vsel    29,2,1,0
+       .long   0x1042E0C0
+       .long   0x1063E8C0
+       .long   0x13C0FEC2
+       .long   0x1063F0C0
+       vxor    29,4,5
+       vsel    29,5,6,29
+       .long   0x10E718C0
+       .long   0x13C486C2
+       .long   0x13DEE8C0
+       .long   0x1063F0C0
+       lvx     28,30,7
+       .long   0x7F402699
+       addi    4,4,16
+       vsldoi  25,24,24,8
+       .long   0x1042C8C0
+       vsel    29,1,0,7
+       .long   0x1021E0C0
+       .long   0x1042E8C0
+       .long   0x13C7FEC2
+       .long   0x1042F0C0
+       vxor    29,3,4
+       vsel    29,4,5,29
+       .long   0x10C610C0
+       .long   0x13C386C2
+       .long   0x13DEE8C0
+       .long   0x1042F0C0
+       lvx     28,31,7
+       addi    7,7,0x80
+       vperm   26,26,26,31
+       .long   0x1021D0C0
+       vsel    29,0,7,6
+       .long   0x1000E0C0
+       .long   0x1021E8C0
+       .long   0x13C6FEC2
+       .long   0x1021F0C0
+       vxor    29,2,3
+       vsel    29,3,4,29
+       .long   0x10A508C0
+       .long   0x13C286C2
+       .long   0x13DEE8C0
+       .long   0x1021F0C0
+       lvx     28,0,7
+       vsldoi  27,26,26,8
+       .long   0x13C906C2
+       .long   0x1108F0C0
+       .long   0x13DA7EC2
+       .long   0x1108F0C0
+       .long   0x110888C0
+       .long   0x1000D8C0
+       vsel    29,7,6,5
+       .long   0x10E7E0C0
+       .long   0x1000E8C0
+       .long   0x13C5FEC2
+       .long   0x1000F0C0
+       vxor    29,1,2
+       vsel    29,2,3,29
+       .long   0x108400C0
+       .long   0x13C186C2
+       .long   0x13DEE8C0
+       .long   0x1000F0C0
+       lvx     28,10,7
+       mtctr   0
+       b       .L16_xx
+.align 5
+.L16_xx:
+       .long   0x13CA06C2
+       .long   0x1129F0C0
+       .long   0x13DB7EC2
+       .long   0x1129F0C0
+       .long   0x112990C0
+       .long   0x10E740C0
+       vsel    29,6,5,4
+       .long   0x10C6E0C0
+       .long   0x10E7E8C0
+       .long   0x13C4FEC2
+       .long   0x10E7F0C0
+       vxor    29,0,1
+       vsel    29,1,2,29
+       .long   0x106338C0
+       .long   0x13C086C2
+       .long   0x13DEE8C0
+       .long   0x10E7F0C0
+       lvx     28,26,7
+       .long   0x13CB06C2
+       .long   0x114AF0C0
+       .long   0x13C87EC2
+       .long   0x114AF0C0
+       .long   0x114A98C0
+       .long   0x10C648C0
+       vsel    29,5,4,3
+       .long   0x10A5E0C0
+       .long   0x10C6E8C0
+       .long   0x13C3FEC2
+       .long   0x10C6F0C0
+       vxor    29,7,0
+       vsel    29,0,1,29
+       .long   0x104230C0
+       .long   0x13C786C2
+       .long   0x13DEE8C0
+       .long   0x10C6F0C0
+       lvx     28,27,7
+       .long   0x13CC06C2
+       .long   0x116BF0C0
+       .long   0x13C97EC2
+       .long   0x116BF0C0
+       .long   0x116BC0C0
+       .long   0x10A550C0
+       vsel    29,4,3,2
+       .long   0x1084E0C0
+       .long   0x10A5E8C0
+       .long   0x13C2FEC2
+       .long   0x10A5F0C0
+       vxor    29,6,7
+       vsel    29,7,0,29
+       .long   0x102128C0
+       .long   0x13C686C2
+       .long   0x13DEE8C0
+       .long   0x10A5F0C0
+       lvx     28,28,7
+       .long   0x13CD06C2
+       .long   0x118CF0C0
+       .long   0x13CA7EC2
+       .long   0x118CF0C0
+       .long   0x118CC8C0
+       .long   0x108458C0
+       vsel    29,3,2,1
+       .long   0x1063E0C0
+       .long   0x1084E8C0
+       .long   0x13C1FEC2
+       .long   0x1084F0C0
+       vxor    29,5,6
+       vsel    29,6,7,29
+       .long   0x100020C0
+       .long   0x13C586C2
+       .long   0x13DEE8C0
+       .long   0x1084F0C0
+       lvx     28,29,7
+       .long   0x13CE06C2
+       .long   0x11ADF0C0
+       .long   0x13CB7EC2
+       .long   0x11ADF0C0
+       .long   0x11ADD0C0
+       .long   0x106360C0
+       vsel    29,2,1,0
+       .long   0x1042E0C0
+       .long   0x1063E8C0
+       .long   0x13C0FEC2
+       .long   0x1063F0C0
+       vxor    29,4,5
+       vsel    29,5,6,29
+       .long   0x10E718C0
+       .long   0x13C486C2
+       .long   0x13DEE8C0
+       .long   0x1063F0C0
+       lvx     28,30,7
+       .long   0x13CF06C2
+       .long   0x11CEF0C0
+       .long   0x13CC7EC2
+       .long   0x11CEF0C0
+       .long   0x11CED8C0
+       .long   0x104268C0
+       vsel    29,1,0,7
+       .long   0x1021E0C0
+       .long   0x1042E8C0
+       .long   0x13C7FEC2
+       .long   0x1042F0C0
+       vxor    29,3,4
+       vsel    29,4,5,29
+       .long   0x10C610C0
+       .long   0x13C386C2
+       .long   0x13DEE8C0
+       .long   0x1042F0C0
+       lvx     28,31,7
+       addi    7,7,0x80
+       .long   0x13D006C2
+       .long   0x11EFF0C0
+       .long   0x13CD7EC2
+       .long   0x11EFF0C0
+       .long   0x11EF40C0
+       .long   0x102170C0
+       vsel    29,0,7,6
+       .long   0x1000E0C0
+       .long   0x1021E8C0
+       .long   0x13C6FEC2
+       .long   0x1021F0C0
+       vxor    29,2,3
+       vsel    29,3,4,29
+       .long   0x10A508C0
+       .long   0x13C286C2
+       .long   0x13DEE8C0
+       .long   0x1021F0C0
+       lvx     28,0,7
+       .long   0x13D106C2
+       .long   0x1210F0C0
+       .long   0x13CE7EC2
+       .long   0x1210F0C0
+       .long   0x121048C0
+       .long   0x100078C0
+       vsel    29,7,6,5
+       .long   0x10E7E0C0
+       .long   0x1000E8C0
+       .long   0x13C5FEC2
+       .long   0x1000F0C0
+       vxor    29,1,2
+       vsel    29,2,3,29
+       .long   0x108400C0
+       .long   0x13C186C2
+       .long   0x13DEE8C0
+       .long   0x1000F0C0
+       lvx     28,10,7
+       .long   0x13D206C2
+       .long   0x1231F0C0
+       .long   0x13CF7EC2
+       .long   0x1231F0C0
+       .long   0x123150C0
+       .long   0x10E780C0
+       vsel    29,6,5,4
+       .long   0x10C6E0C0
+       .long   0x10E7E8C0
+       .long   0x13C4FEC2
+       .long   0x10E7F0C0
+       vxor    29,0,1
+       vsel    29,1,2,29
+       .long   0x106338C0
+       .long   0x13C086C2
+       .long   0x13DEE8C0
+       .long   0x10E7F0C0
+       lvx     28,26,7
+       .long   0x13D306C2
+       .long   0x1252F0C0
+       .long   0x13D07EC2
+       .long   0x1252F0C0
+       .long   0x125258C0
+       .long   0x10C688C0
+       vsel    29,5,4,3
+       .long   0x10A5E0C0
+       .long   0x10C6E8C0
+       .long   0x13C3FEC2
+       .long   0x10C6F0C0
+       vxor    29,7,0
+       vsel    29,0,1,29
+       .long   0x104230C0
+       .long   0x13C786C2
+       .long   0x13DEE8C0
+       .long   0x10C6F0C0
+       lvx     28,27,7
+       .long   0x13D806C2
+       .long   0x1273F0C0
+       .long   0x13D17EC2
+       .long   0x1273F0C0
+       .long   0x127360C0
+       .long   0x10A590C0
+       vsel    29,4,3,2
+       .long   0x1084E0C0
+       .long   0x10A5E8C0
+       .long   0x13C2FEC2
+       .long   0x10A5F0C0
+       vxor    29,6,7
+       vsel    29,7,0,29
+       .long   0x102128C0
+       .long   0x13C686C2
+       .long   0x13DEE8C0
+       .long   0x10A5F0C0
+       lvx     28,28,7
+       .long   0x13D906C2
+       .long   0x1318F0C0
+       .long   0x13D27EC2
+       .long   0x1318F0C0
+       .long   0x131868C0
+       .long   0x108498C0
+       vsel    29,3,2,1
+       .long   0x1063E0C0
+       .long   0x1084E8C0
+       .long   0x13C1FEC2
+       .long   0x1084F0C0
+       vxor    29,5,6
+       vsel    29,6,7,29
+       .long   0x100020C0
+       .long   0x13C586C2
+       .long   0x13DEE8C0
+       .long   0x1084F0C0
+       lvx     28,29,7
+       .long   0x13DA06C2
+       .long   0x1339F0C0
+       .long   0x13D37EC2
+       .long   0x1339F0C0
+       .long   0x133970C0
+       .long   0x1063C0C0
+       vsel    29,2,1,0
+       .long   0x1042E0C0
+       .long   0x1063E8C0
+       .long   0x13C0FEC2
+       .long   0x1063F0C0
+       vxor    29,4,5
+       vsel    29,5,6,29
+       .long   0x10E718C0
+       .long   0x13C486C2
+       .long   0x13DEE8C0
+       .long   0x1063F0C0
+       lvx     28,30,7
+       .long   0x13DB06C2
+       .long   0x135AF0C0
+       .long   0x13D87EC2
+       .long   0x135AF0C0
+       .long   0x135A78C0
+       .long   0x1042C8C0
+       vsel    29,1,0,7
+       .long   0x1021E0C0
+       .long   0x1042E8C0
+       .long   0x13C7FEC2
+       .long   0x1042F0C0
+       vxor    29,3,4
+       vsel    29,4,5,29
+       .long   0x10C610C0
+       .long   0x13C386C2
+       .long   0x13DEE8C0
+       .long   0x1042F0C0
+       lvx     28,31,7
+       addi    7,7,0x80
+       .long   0x13C806C2
+       .long   0x137BF0C0
+       .long   0x13D97EC2
+       .long   0x137BF0C0
+       .long   0x137B80C0
+       .long   0x1021D0C0
+       vsel    29,0,7,6
+       .long   0x1000E0C0
+       .long   0x1021E8C0
+       .long   0x13C6FEC2
+       .long   0x1021F0C0
+       vxor    29,2,3
+       vsel    29,3,4,29
+       .long   0x10A508C0
+       .long   0x13C286C2
+       .long   0x13DEE8C0
+       .long   0x1021F0C0
+       lvx     28,0,7
+       .long   0x13C906C2
+       .long   0x1108F0C0
+       .long   0x13DA7EC2
+       .long   0x1108F0C0
+       .long   0x110888C0
+       .long   0x1000D8C0
+       vsel    29,7,6,5
+       .long   0x10E7E0C0
+       .long   0x1000E8C0
+       .long   0x13C5FEC2
+       .long   0x1000F0C0
+       vxor    29,1,2
+       vsel    29,2,3,29
+       .long   0x108400C0
+       .long   0x13C186C2
+       .long   0x13DEE8C0
+       .long   0x1000F0C0
+       lvx     28,10,7
+       bdnz    .L16_xx
+
+       lvx     10,0,11
+       subic.  5,5,1
+       lvx     11,10,11
+       .long   0x100050C0
+       lvx     12,26,11
+       .long   0x102158C0
+       lvx     13,27,11
+       .long   0x104260C0
+       lvx     14,28,11
+       .long   0x106368C0
+       lvx     15,29,11
+       .long   0x108470C0
+       lvx     16,30,11
+       .long   0x10A578C0
+       lvx     17,31,11
+       .long   0x10C680C0
+       .long   0x10E788C0
+       bne     .Loop
+       vperm   0,0,1,28
+       vperm   2,2,3,28
+       vperm   4,4,5,28
+       vperm   6,6,7,28
+       .long   0x7C001F99
+       .long   0x7C4A1F99
+       .long   0x7C9A1F99
+       .long   0x7CDB1F99
+       addi    11,1,207
+       mtlr    8
+       or      12,12,12
+       lvx     24,0,11
+       lvx     25,10,11
+       lvx     26,26,11
+       lvx     27,27,11
+       lvx     28,28,11
+       lvx     29,29,11
+       lvx     30,30,11
+       lvx     31,31,11
+       ld      26,336(1)
+       ld      27,344(1)
+       ld      28,352(1)
+       ld      29,360(1)
+       ld      30,368(1)
+       ld      31,376(1)
+       addi    1,1,384
+       blr     
+.long  0
+.byte  0,12,4,1,0x80,6,3,0
+.long  0
+.size  zfs_sha512_power8,.-zfs_sha512_power8
+.align 6
+.LPICmeup:
+       mflr    0
+       bcl     20,31,$+4
+       mflr    6
+       addi    6,6,56
+       mtlr    0
+       blr     
+.long  0
+.byte  0,12,0x14,0,0,0,0,0
+.space 28
+.long  0xd728ae22,0x428a2f98
+.long  0xd728ae22,0x428a2f98
+.long  0x23ef65cd,0x71374491
+.long  0x23ef65cd,0x71374491
+.long  0xec4d3b2f,0xb5c0fbcf
+.long  0xec4d3b2f,0xb5c0fbcf
+.long  0x8189dbbc,0xe9b5dba5
+.long  0x8189dbbc,0xe9b5dba5
+.long  0xf348b538,0x3956c25b
+.long  0xf348b538,0x3956c25b
+.long  0xb605d019,0x59f111f1
+.long  0xb605d019,0x59f111f1
+.long  0xaf194f9b,0x923f82a4
+.long  0xaf194f9b,0x923f82a4
+.long  0xda6d8118,0xab1c5ed5
+.long  0xda6d8118,0xab1c5ed5
+.long  0xa3030242,0xd807aa98
+.long  0xa3030242,0xd807aa98
+.long  0x45706fbe,0x12835b01
+.long  0x45706fbe,0x12835b01
+.long  0x4ee4b28c,0x243185be
+.long  0x4ee4b28c,0x243185be
+.long  0xd5ffb4e2,0x550c7dc3
+.long  0xd5ffb4e2,0x550c7dc3
+.long  0xf27b896f,0x72be5d74
+.long  0xf27b896f,0x72be5d74
+.long  0x3b1696b1,0x80deb1fe
+.long  0x3b1696b1,0x80deb1fe
+.long  0x25c71235,0x9bdc06a7
+.long  0x25c71235,0x9bdc06a7
+.long  0xcf692694,0xc19bf174
+.long  0xcf692694,0xc19bf174
+.long  0x9ef14ad2,0xe49b69c1
+.long  0x9ef14ad2,0xe49b69c1
+.long  0x384f25e3,0xefbe4786
+.long  0x384f25e3,0xefbe4786
+.long  0x8b8cd5b5,0x0fc19dc6
+.long  0x8b8cd5b5,0x0fc19dc6
+.long  0x77ac9c65,0x240ca1cc
+.long  0x77ac9c65,0x240ca1cc
+.long  0x592b0275,0x2de92c6f
+.long  0x592b0275,0x2de92c6f
+.long  0x6ea6e483,0x4a7484aa
+.long  0x6ea6e483,0x4a7484aa
+.long  0xbd41fbd4,0x5cb0a9dc
+.long  0xbd41fbd4,0x5cb0a9dc
+.long  0x831153b5,0x76f988da
+.long  0x831153b5,0x76f988da
+.long  0xee66dfab,0x983e5152
+.long  0xee66dfab,0x983e5152
+.long  0x2db43210,0xa831c66d
+.long  0x2db43210,0xa831c66d
+.long  0x98fb213f,0xb00327c8
+.long  0x98fb213f,0xb00327c8
+.long  0xbeef0ee4,0xbf597fc7
+.long  0xbeef0ee4,0xbf597fc7
+.long  0x3da88fc2,0xc6e00bf3
+.long  0x3da88fc2,0xc6e00bf3
+.long  0x930aa725,0xd5a79147
+.long  0x930aa725,0xd5a79147
+.long  0xe003826f,0x06ca6351
+.long  0xe003826f,0x06ca6351
+.long  0x0a0e6e70,0x14292967
+.long  0x0a0e6e70,0x14292967
+.long  0x46d22ffc,0x27b70a85
+.long  0x46d22ffc,0x27b70a85
+.long  0x5c26c926,0x2e1b2138
+.long  0x5c26c926,0x2e1b2138
+.long  0x5ac42aed,0x4d2c6dfc
+.long  0x5ac42aed,0x4d2c6dfc
+.long  0x9d95b3df,0x53380d13
+.long  0x9d95b3df,0x53380d13
+.long  0x8baf63de,0x650a7354
+.long  0x8baf63de,0x650a7354
+.long  0x3c77b2a8,0x766a0abb
+.long  0x3c77b2a8,0x766a0abb
+.long  0x47edaee6,0x81c2c92e
+.long  0x47edaee6,0x81c2c92e
+.long  0x1482353b,0x92722c85
+.long  0x1482353b,0x92722c85
+.long  0x4cf10364,0xa2bfe8a1
+.long  0x4cf10364,0xa2bfe8a1
+.long  0xbc423001,0xa81a664b
+.long  0xbc423001,0xa81a664b
+.long  0xd0f89791,0xc24b8b70
+.long  0xd0f89791,0xc24b8b70
+.long  0x0654be30,0xc76c51a3
+.long  0x0654be30,0xc76c51a3
+.long  0xd6ef5218,0xd192e819
+.long  0xd6ef5218,0xd192e819
+.long  0x5565a910,0xd6990624
+.long  0x5565a910,0xd6990624
+.long  0x5771202a,0xf40e3585
+.long  0x5771202a,0xf40e3585
+.long  0x32bbd1b8,0x106aa070
+.long  0x32bbd1b8,0x106aa070
+.long  0xb8d2d0c8,0x19a4c116
+.long  0xb8d2d0c8,0x19a4c116
+.long  0x5141ab53,0x1e376c08
+.long  0x5141ab53,0x1e376c08
+.long  0xdf8eeb99,0x2748774c
+.long  0xdf8eeb99,0x2748774c
+.long  0xe19b48a8,0x34b0bcb5
+.long  0xe19b48a8,0x34b0bcb5
+.long  0xc5c95a63,0x391c0cb3
+.long  0xc5c95a63,0x391c0cb3
+.long  0xe3418acb,0x4ed8aa4a
+.long  0xe3418acb,0x4ed8aa4a
+.long  0x7763e373,0x5b9cca4f
+.long  0x7763e373,0x5b9cca4f
+.long  0xd6b2b8a3,0x682e6ff3
+.long  0xd6b2b8a3,0x682e6ff3
+.long  0x5defb2fc,0x748f82ee
+.long  0x5defb2fc,0x748f82ee
+.long  0x43172f60,0x78a5636f
+.long  0x43172f60,0x78a5636f
+.long  0xa1f0ab72,0x84c87814
+.long  0xa1f0ab72,0x84c87814
+.long  0x1a6439ec,0x8cc70208
+.long  0x1a6439ec,0x8cc70208
+.long  0x23631e28,0x90befffa
+.long  0x23631e28,0x90befffa
+.long  0xde82bde9,0xa4506ceb
+.long  0xde82bde9,0xa4506ceb
+.long  0xb2c67915,0xbef9a3f7
+.long  0xb2c67915,0xbef9a3f7
+.long  0xe372532b,0xc67178f2
+.long  0xe372532b,0xc67178f2
+.long  0xea26619c,0xca273ece
+.long  0xea26619c,0xca273ece
+.long  0x21c0c207,0xd186b8c7
+.long  0x21c0c207,0xd186b8c7
+.long  0xcde0eb1e,0xeada7dd6
+.long  0xcde0eb1e,0xeada7dd6
+.long  0xee6ed178,0xf57d4f7f
+.long  0xee6ed178,0xf57d4f7f
+.long  0x72176fba,0x06f067aa
+.long  0x72176fba,0x06f067aa
+.long  0xa2c898a6,0x0a637dc5
+.long  0xa2c898a6,0x0a637dc5
+.long  0xbef90dae,0x113f9804
+.long  0xbef90dae,0x113f9804
+.long  0x131c471b,0x1b710b35
+.long  0x131c471b,0x1b710b35
+.long  0x23047d84,0x28db77f5
+.long  0x23047d84,0x28db77f5
+.long  0x40c72493,0x32caab7b
+.long  0x40c72493,0x32caab7b
+.long  0x15c9bebc,0x3c9ebe0a
+.long  0x15c9bebc,0x3c9ebe0a
+.long  0x9c100d4c,0x431d67c4
+.long  0x9c100d4c,0x431d67c4
+.long  0xcb3e42b6,0x4cc5d4be
+.long  0xcb3e42b6,0x4cc5d4be
+.long  0xfc657e2a,0x597f299c
+.long  0xfc657e2a,0x597f299c
+.long  0x3ad6faec,0x5fcb6fab
+.long  0x3ad6faec,0x5fcb6fab
+.long  0x4a475817,0x6c44198c
+.long  0x4a475817,0x6c44198c
+.long  0,0
+.long  0,0
+.long  0x14151617,0x10111213
+.long  0x04050607,0x00010203
+
+#endif
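Note (illustrative, not part of the commit): both the Power8 vector routine above (zfs_sha512_power8) and the generic PPC routine in the next file (zfs_sha512_ppc) compute the standard SHA-512 compression function; the rotation amounts visible in the assembly (14/18/41, 28/34/39, 1/8 with >>7, 19/61 with >>6) are the usual Sigma/sigma constants, and the .long-encoded opcodes in the Power8 variant appear to encode the POWER8 vector/crypto instructions (e.g. vshasigmad) for assemblers that lack those mnemonics. As a purely illustrative scalar sketch of what one round and one message-schedule step do, with hypothetical helper names (this is not the committed sha2_generic.c code):

#include <stdint.h>

/* Illustrative only: standard SHA-512 round primitives. */
#define	ROTR64(x, n)	(((x) >> (n)) | ((x) << (64 - (n))))

#define	BSIG0(x)	(ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39))
#define	BSIG1(x)	(ROTR64(x, 14) ^ ROTR64(x, 18) ^ ROTR64(x, 41))
#define	SSIG0(x)	(ROTR64(x, 1)  ^ ROTR64(x, 8)  ^ ((x) >> 7))
#define	SSIG1(x)	(ROTR64(x, 19) ^ ROTR64(x, 61) ^ ((x) >> 6))

/* One round of the SHA-512 compression function (hypothetical helper). */
static inline void
sha512_round(uint64_t s[8], uint64_t k, uint64_t w)
{
	uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
	uint64_t e = s[4], f = s[5], g = s[6], h = s[7];

	/* T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t] */
	uint64_t t1 = h + BSIG1(e) + ((e & f) ^ (~e & g)) + k + w;
	/* T2 = Sigma0(a) + Maj(a,b,c) */
	uint64_t t2 = BSIG0(a) + ((a & b) ^ (a & c) ^ (b & c));

	s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
	s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
}

/* Message-schedule update for rounds 16..79, using a 16-word ring buffer. */
static inline uint64_t
sha512_schedule(uint64_t w[16], int t)
{
	w[t & 15] += SSIG1(w[(t - 2) & 15]) + w[(t - 7) & 15] +
	    SSIG0(w[(t - 15) & 15]);
	return (w[t & 15]);
}

The assembly unrolls 16 such rounds per loop iteration and keeps the working variables and schedule words entirely in registers, which is where its speed advantage over a straightforward C loop comes from.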
diff --git a/module/icp/asm-ppc64/sha2/sha512-ppc.S b/module/icp/asm-ppc64/sha2/sha512-ppc.S
new file mode 100644 (file)
index 0000000..3707011
--- /dev/null
+++ b/module/icp/asm-ppc64/sha2/sha512-ppc.S
@@ -0,0 +1,2958 @@
+/*
+ * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Portions Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ * - modified assembly to fit into OpenZFS
+ */
+
+#if (defined(__PPC64__) && defined(__BIG_ENDIAN__))
+
+.text
+
+.globl zfs_sha512_ppc
+.globl .zfs_sha512_ppc
+.type  zfs_sha512_ppc,@function
+.section       ".opd","aw"
+.align 3
+zfs_sha512_ppc:
+.quad  .zfs_sha512_ppc,.TOC.@tocbase,0
+.previous
+.align 6
+.zfs_sha512_ppc:
+       stdu    1,-384(1)
+       mflr    0
+       sldi    5,5,7
+
+       std     3,208(1)
+
+       std     14,240(1)
+       std     15,248(1)
+       std     16,256(1)
+       std     17,264(1)
+       std     18,272(1)
+       std     19,280(1)
+       std     20,288(1)
+       std     21,296(1)
+       std     22,304(1)
+       std     23,312(1)
+       std     24,320(1)
+       std     25,328(1)
+       std     26,336(1)
+       std     27,344(1)
+       std     28,352(1)
+       std     29,360(1)
+       std     30,368(1)
+       std     31,376(1)
+       std     0,400(1)
+       ld      8,0(3)
+       mr      31,4
+       ld      9,8(3)
+       ld      10,16(3)
+       ld      11,24(3)
+       ld      12,32(3)
+       ld      6,40(3)
+       ld      14,48(3)
+       ld      15,56(3)
+       bl      .LPICmeup
+.LPICedup:
+       andi.   0,31,3
+       bne     .Lunaligned
+.Laligned:
+       add     5,31,5
+       std     5,192(1)
+       std     31,200(1)
+       bl      .Lsha2_block_private
+       b       .Ldone
+
+
+
+
+
+
+
+.align 4
+.Lunaligned:
+       subfic  0,31,4096
+       andi.   0,0,3968
+       beq     .Lcross_page
+       cmpld   5,0
+       ble     .Laligned
+       subfc   5,0,5
+       add     0,31,0
+       std     5,184(1)
+       std     0,192(1)
+       std     31,200(1)
+       bl      .Lsha2_block_private
+
+       ld      5,184(1)
+.Lcross_page:
+       li      0,32
+       mtctr   0
+       addi    20,1,48
+.Lmemcpy:
+       lbz     16,0(31)
+       lbz     17,1(31)
+       lbz     18,2(31)
+       lbz     19,3(31)
+       addi    31,31,4
+       stb     16,0(20)
+       stb     17,1(20)
+       stb     18,2(20)
+       stb     19,3(20)
+       addi    20,20,4
+       bdnz    .Lmemcpy
+       std     31,176(1)
+       addi    0,1,176
+       addi    31,1,48
+       std     5,184(1)
+       std     0,192(1)
+       std     31,200(1)
+       bl      .Lsha2_block_private
+       ld      31,176(1)
+       ld      5,184(1)
+       addic.  5,5,-128
+       bne     .Lunaligned
+
+.Ldone:
+       ld      0,400(1)
+       ld      14,240(1)
+       ld      15,248(1)
+       ld      16,256(1)
+       ld      17,264(1)
+       ld      18,272(1)
+       ld      19,280(1)
+       ld      20,288(1)
+       ld      21,296(1)
+       ld      22,304(1)
+       ld      23,312(1)
+       ld      24,320(1)
+       ld      25,328(1)
+       ld      26,336(1)
+       ld      27,344(1)
+       ld      28,352(1)
+       ld      29,360(1)
+       ld      30,368(1)
+       ld      31,376(1)
+       mtlr    0
+       addi    1,1,384
+       blr     
+.long  0
+.byte  0,12,4,1,0x80,18,3,0
+.long  0
+.align 4
+.Lsha2_block_private:
+       ld      0,0(7)
+       lwz     5,0(31)
+       lwz     16,4(31)
+       insrdi  16,5,32,0
+       rotrdi  3,12,14
+       rotrdi  4,12,18
+       and     5,6,12
+       xor     3,3,4
+       add     15,15,0
+       andc    0,14,12
+       rotrdi  4,4,23
+       or      5,5,0
+       add     15,15,16
+       xor     3,3,4
+       add     15,15,5
+       add     15,15,3
+
+       rotrdi  3,8,28
+       rotrdi  4,8,34
+       and     5,8,9
+       and     0,8,10
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,9,10
+       xor     3,3,4
+       add     11,11,15
+       xor     5,5,0
+       ld      0,8(7)
+       add     15,15,3
+       add     15,15,5
+
+       lwz     5,8(31)
+       lwz     17,12(31)
+       insrdi  17,5,32,0
+       rotrdi  3,11,14
+       rotrdi  4,11,18
+       and     5,12,11
+       xor     3,3,4
+       add     14,14,0
+       andc    0,6,11
+       rotrdi  4,4,23
+       or      5,5,0
+       add     14,14,17
+       xor     3,3,4
+       add     14,14,5
+       add     14,14,3
+
+       rotrdi  3,15,28
+       rotrdi  4,15,34
+       and     5,15,8
+       and     0,15,9
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,8,9
+       xor     3,3,4
+       add     10,10,14
+       xor     5,5,0
+       ld      0,16(7)
+       add     14,14,3
+       add     14,14,5
+
+       lwz     5,16(31)
+       lwz     18,20(31)
+       insrdi  18,5,32,0
+       rotrdi  3,10,14
+       rotrdi  4,10,18
+       and     5,11,10
+       xor     3,3,4
+       add     6,6,0
+       andc    0,12,10
+       rotrdi  4,4,23
+       or      5,5,0
+       add     6,6,18
+       xor     3,3,4
+       add     6,6,5
+       add     6,6,3
+
+       rotrdi  3,14,28
+       rotrdi  4,14,34
+       and     5,14,15
+       and     0,14,8
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,15,8
+       xor     3,3,4
+       add     9,9,6
+       xor     5,5,0
+       ld      0,24(7)
+       add     6,6,3
+       add     6,6,5
+
+       lwz     5,24(31)
+       lwz     19,28(31)
+       insrdi  19,5,32,0
+       rotrdi  3,9,14
+       rotrdi  4,9,18
+       and     5,10,9
+       xor     3,3,4
+       add     12,12,0
+       andc    0,11,9
+       rotrdi  4,4,23
+       or      5,5,0
+       add     12,12,19
+       xor     3,3,4
+       add     12,12,5
+       add     12,12,3
+
+       rotrdi  3,6,28
+       rotrdi  4,6,34
+       and     5,6,14
+       and     0,6,15
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,14,15
+       xor     3,3,4
+       add     8,8,12
+       xor     5,5,0
+       ld      0,32(7)
+       add     12,12,3
+       add     12,12,5
+
+       lwz     5,32(31)
+       lwz     20,36(31)
+       insrdi  20,5,32,0
+       rotrdi  3,8,14
+       rotrdi  4,8,18
+       and     5,9,8
+       xor     3,3,4
+       add     11,11,0
+       andc    0,10,8
+       rotrdi  4,4,23
+       or      5,5,0
+       add     11,11,20
+       xor     3,3,4
+       add     11,11,5
+       add     11,11,3
+
+       rotrdi  3,12,28
+       rotrdi  4,12,34
+       and     5,12,6
+       and     0,12,14
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,6,14
+       xor     3,3,4
+       add     15,15,11
+       xor     5,5,0
+       ld      0,40(7)
+       add     11,11,3
+       add     11,11,5
+
+       lwz     5,40(31)
+       lwz     21,44(31)
+       insrdi  21,5,32,0
+       rotrdi  3,15,14
+       rotrdi  4,15,18
+       and     5,8,15
+       xor     3,3,4
+       add     10,10,0
+       andc    0,9,15
+       rotrdi  4,4,23
+       or      5,5,0
+       add     10,10,21
+       xor     3,3,4
+       add     10,10,5
+       add     10,10,3
+
+       rotrdi  3,11,28
+       rotrdi  4,11,34
+       and     5,11,12
+       and     0,11,6
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,12,6
+       xor     3,3,4
+       add     14,14,10
+       xor     5,5,0
+       ld      0,48(7)
+       add     10,10,3
+       add     10,10,5
+
+       lwz     5,48(31)
+       lwz     22,52(31)
+       insrdi  22,5,32,0
+       rotrdi  3,14,14
+       rotrdi  4,14,18
+       and     5,15,14
+       xor     3,3,4
+       add     9,9,0
+       andc    0,8,14
+       rotrdi  4,4,23
+       or      5,5,0
+       add     9,9,22
+       xor     3,3,4
+       add     9,9,5
+       add     9,9,3
+
+       rotrdi  3,10,28
+       rotrdi  4,10,34
+       and     5,10,11
+       and     0,10,12
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,11,12
+       xor     3,3,4
+       add     6,6,9
+       xor     5,5,0
+       ld      0,56(7)
+       add     9,9,3
+       add     9,9,5
+
+       lwz     5,56(31)
+       lwz     23,60(31)
+       insrdi  23,5,32,0
+       rotrdi  3,6,14
+       rotrdi  4,6,18
+       and     5,14,6
+       xor     3,3,4
+       add     8,8,0
+       andc    0,15,6
+       rotrdi  4,4,23
+       or      5,5,0
+       add     8,8,23
+       xor     3,3,4
+       add     8,8,5
+       add     8,8,3
+
+       rotrdi  3,9,28
+       rotrdi  4,9,34
+       and     5,9,10
+       and     0,9,11
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,10,11
+       xor     3,3,4
+       add     12,12,8
+       xor     5,5,0
+       ld      0,64(7)
+       add     8,8,3
+       add     8,8,5
+
+       lwz     5,64(31)
+       lwz     24,68(31)
+       insrdi  24,5,32,0
+       rotrdi  3,12,14
+       rotrdi  4,12,18
+       and     5,6,12
+       xor     3,3,4
+       add     15,15,0
+       andc    0,14,12
+       rotrdi  4,4,23
+       or      5,5,0
+       add     15,15,24
+       xor     3,3,4
+       add     15,15,5
+       add     15,15,3
+
+       rotrdi  3,8,28
+       rotrdi  4,8,34
+       and     5,8,9
+       and     0,8,10
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,9,10
+       xor     3,3,4
+       add     11,11,15
+       xor     5,5,0
+       ld      0,72(7)
+       add     15,15,3
+       add     15,15,5
+
+       lwz     5,72(31)
+       lwz     25,76(31)
+       insrdi  25,5,32,0
+       rotrdi  3,11,14
+       rotrdi  4,11,18
+       and     5,12,11
+       xor     3,3,4
+       add     14,14,0
+       andc    0,6,11
+       rotrdi  4,4,23
+       or      5,5,0
+       add     14,14,25
+       xor     3,3,4
+       add     14,14,5
+       add     14,14,3
+
+       rotrdi  3,15,28
+       rotrdi  4,15,34
+       and     5,15,8
+       and     0,15,9
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,8,9
+       xor     3,3,4
+       add     10,10,14
+       xor     5,5,0
+       ld      0,80(7)
+       add     14,14,3
+       add     14,14,5
+
+       lwz     5,80(31)
+       lwz     26,84(31)
+       insrdi  26,5,32,0
+       rotrdi  3,10,14
+       rotrdi  4,10,18
+       and     5,11,10
+       xor     3,3,4
+       add     6,6,0
+       andc    0,12,10
+       rotrdi  4,4,23
+       or      5,5,0
+       add     6,6,26
+       xor     3,3,4
+       add     6,6,5
+       add     6,6,3
+
+       rotrdi  3,14,28
+       rotrdi  4,14,34
+       and     5,14,15
+       and     0,14,8
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,15,8
+       xor     3,3,4
+       add     9,9,6
+       xor     5,5,0
+       ld      0,88(7)
+       add     6,6,3
+       add     6,6,5
+
+       lwz     5,88(31)
+       lwz     27,92(31)
+       insrdi  27,5,32,0
+       rotrdi  3,9,14
+       rotrdi  4,9,18
+       and     5,10,9
+       xor     3,3,4
+       add     12,12,0
+       andc    0,11,9
+       rotrdi  4,4,23
+       or      5,5,0
+       add     12,12,27
+       xor     3,3,4
+       add     12,12,5
+       add     12,12,3
+
+       rotrdi  3,6,28
+       rotrdi  4,6,34
+       and     5,6,14
+       and     0,6,15
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,14,15
+       xor     3,3,4
+       add     8,8,12
+       xor     5,5,0
+       ld      0,96(7)
+       add     12,12,3
+       add     12,12,5
+
+       lwz     5,96(31)
+       lwz     28,100(31)
+       insrdi  28,5,32,0
+       rotrdi  3,8,14
+       rotrdi  4,8,18
+       and     5,9,8
+       xor     3,3,4
+       add     11,11,0
+       andc    0,10,8
+       rotrdi  4,4,23
+       or      5,5,0
+       add     11,11,28
+       xor     3,3,4
+       add     11,11,5
+       add     11,11,3
+
+       rotrdi  3,12,28
+       rotrdi  4,12,34
+       and     5,12,6
+       and     0,12,14
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,6,14
+       xor     3,3,4
+       add     15,15,11
+       xor     5,5,0
+       ld      0,104(7)
+       add     11,11,3
+       add     11,11,5
+
+       lwz     5,104(31)
+       lwz     29,108(31)
+       insrdi  29,5,32,0
+       rotrdi  3,15,14
+       rotrdi  4,15,18
+       and     5,8,15
+       xor     3,3,4
+       add     10,10,0
+       andc    0,9,15
+       rotrdi  4,4,23
+       or      5,5,0
+       add     10,10,29
+       xor     3,3,4
+       add     10,10,5
+       add     10,10,3
+
+       rotrdi  3,11,28
+       rotrdi  4,11,34
+       and     5,11,12
+       and     0,11,6
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,12,6
+       xor     3,3,4
+       add     14,14,10
+       xor     5,5,0
+       ld      0,112(7)
+       add     10,10,3
+       add     10,10,5
+
+       lwz     5,112(31)
+       lwz     30,116(31)
+       insrdi  30,5,32,0
+       rotrdi  3,14,14
+       rotrdi  4,14,18
+       and     5,15,14
+       xor     3,3,4
+       add     9,9,0
+       andc    0,8,14
+       rotrdi  4,4,23
+       or      5,5,0
+       add     9,9,30
+       xor     3,3,4
+       add     9,9,5
+       add     9,9,3
+
+       rotrdi  3,10,28
+       rotrdi  4,10,34
+       and     5,10,11
+       and     0,10,12
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,11,12
+       xor     3,3,4
+       add     6,6,9
+       xor     5,5,0
+       ld      0,120(7)
+       add     9,9,3
+       add     9,9,5
+
+       lwz     5,120(31)
+       lwz     31,124(31)
+       insrdi  31,5,32,0
+       rotrdi  3,6,14
+       rotrdi  4,6,18
+       and     5,14,6
+       xor     3,3,4
+       add     8,8,0
+       andc    0,15,6
+       rotrdi  4,4,23
+       or      5,5,0
+       add     8,8,31
+       xor     3,3,4
+       add     8,8,5
+       add     8,8,3
+
+       rotrdi  3,9,28
+       rotrdi  4,9,34
+       and     5,9,10
+       and     0,9,11
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,10,11
+       xor     3,3,4
+       add     12,12,8
+       xor     5,5,0
+       add     8,8,3
+       add     8,8,5
+
+       li      5,4
+       mtctr   5
+.align 4
+.Lrounds:
+       addi    7,7,128
+       rotrdi  3,17,1
+       rotrdi  4,17,8
+       rotrdi  5,30,19
+       rotrdi  0,30,61
+       xor     3,3,4
+       srdi    4,17,7
+       xor     5,5,0
+       srdi    0,30,6
+       add     16,16,25
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,0(7)
+       add     16,16,3
+       add     16,16,5
+       rotrdi  3,12,14
+       rotrdi  4,12,18
+       and     5,6,12
+       xor     3,3,4
+       add     15,15,0
+       andc    0,14,12
+       rotrdi  4,4,23
+       or      5,5,0
+       add     15,15,16
+       xor     3,3,4
+       add     15,15,5
+       add     15,15,3
+
+       rotrdi  3,8,28
+       rotrdi  4,8,34
+       and     5,8,9
+       and     0,8,10
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,9,10
+       xor     3,3,4
+       add     11,11,15
+       xor     5,5,0
+       add     15,15,3
+       add     15,15,5
+
+       rotrdi  3,18,1
+       rotrdi  4,18,8
+       rotrdi  5,31,19
+       rotrdi  0,31,61
+       xor     3,3,4
+       srdi    4,18,7
+       xor     5,5,0
+       srdi    0,31,6
+       add     17,17,26
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,8(7)
+       add     17,17,3
+       add     17,17,5
+       rotrdi  3,11,14
+       rotrdi  4,11,18
+       and     5,12,11
+       xor     3,3,4
+       add     14,14,0
+       andc    0,6,11
+       rotrdi  4,4,23
+       or      5,5,0
+       add     14,14,17
+       xor     3,3,4
+       add     14,14,5
+       add     14,14,3
+
+       rotrdi  3,15,28
+       rotrdi  4,15,34
+       and     5,15,8
+       and     0,15,9
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,8,9
+       xor     3,3,4
+       add     10,10,14
+       xor     5,5,0
+       add     14,14,3
+       add     14,14,5
+
+       rotrdi  3,19,1
+       rotrdi  4,19,8
+       rotrdi  5,16,19
+       rotrdi  0,16,61
+       xor     3,3,4
+       srdi    4,19,7
+       xor     5,5,0
+       srdi    0,16,6
+       add     18,18,27
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,16(7)
+       add     18,18,3
+       add     18,18,5
+       rotrdi  3,10,14
+       rotrdi  4,10,18
+       and     5,11,10
+       xor     3,3,4
+       add     6,6,0
+       andc    0,12,10
+       rotrdi  4,4,23
+       or      5,5,0
+       add     6,6,18
+       xor     3,3,4
+       add     6,6,5
+       add     6,6,3
+
+       rotrdi  3,14,28
+       rotrdi  4,14,34
+       and     5,14,15
+       and     0,14,8
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,15,8
+       xor     3,3,4
+       add     9,9,6
+       xor     5,5,0
+       add     6,6,3
+       add     6,6,5
+
+       rotrdi  3,20,1
+       rotrdi  4,20,8
+       rotrdi  5,17,19
+       rotrdi  0,17,61
+       xor     3,3,4
+       srdi    4,20,7
+       xor     5,5,0
+       srdi    0,17,6
+       add     19,19,28
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,24(7)
+       add     19,19,3
+       add     19,19,5
+       rotrdi  3,9,14
+       rotrdi  4,9,18
+       and     5,10,9
+       xor     3,3,4
+       add     12,12,0
+       andc    0,11,9
+       rotrdi  4,4,23
+       or      5,5,0
+       add     12,12,19
+       xor     3,3,4
+       add     12,12,5
+       add     12,12,3
+
+       rotrdi  3,6,28
+       rotrdi  4,6,34
+       and     5,6,14
+       and     0,6,15
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,14,15
+       xor     3,3,4
+       add     8,8,12
+       xor     5,5,0
+       add     12,12,3
+       add     12,12,5
+
+       rotrdi  3,21,1
+       rotrdi  4,21,8
+       rotrdi  5,18,19
+       rotrdi  0,18,61
+       xor     3,3,4
+       srdi    4,21,7
+       xor     5,5,0
+       srdi    0,18,6
+       add     20,20,29
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,32(7)
+       add     20,20,3
+       add     20,20,5
+       rotrdi  3,8,14
+       rotrdi  4,8,18
+       and     5,9,8
+       xor     3,3,4
+       add     11,11,0
+       andc    0,10,8
+       rotrdi  4,4,23
+       or      5,5,0
+       add     11,11,20
+       xor     3,3,4
+       add     11,11,5
+       add     11,11,3
+
+       rotrdi  3,12,28
+       rotrdi  4,12,34
+       and     5,12,6
+       and     0,12,14
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,6,14
+       xor     3,3,4
+       add     15,15,11
+       xor     5,5,0
+       add     11,11,3
+       add     11,11,5
+
+       rotrdi  3,22,1
+       rotrdi  4,22,8
+       rotrdi  5,19,19
+       rotrdi  0,19,61
+       xor     3,3,4
+       srdi    4,22,7
+       xor     5,5,0
+       srdi    0,19,6
+       add     21,21,30
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,40(7)
+       add     21,21,3
+       add     21,21,5
+       rotrdi  3,15,14
+       rotrdi  4,15,18
+       and     5,8,15
+       xor     3,3,4
+       add     10,10,0
+       andc    0,9,15
+       rotrdi  4,4,23
+       or      5,5,0
+       add     10,10,21
+       xor     3,3,4
+       add     10,10,5
+       add     10,10,3
+
+       rotrdi  3,11,28
+       rotrdi  4,11,34
+       and     5,11,12
+       and     0,11,6
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,12,6
+       xor     3,3,4
+       add     14,14,10
+       xor     5,5,0
+       add     10,10,3
+       add     10,10,5
+
+       rotrdi  3,23,1
+       rotrdi  4,23,8
+       rotrdi  5,20,19
+       rotrdi  0,20,61
+       xor     3,3,4
+       srdi    4,23,7
+       xor     5,5,0
+       srdi    0,20,6
+       add     22,22,31
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,48(7)
+       add     22,22,3
+       add     22,22,5
+       rotrdi  3,14,14
+       rotrdi  4,14,18
+       and     5,15,14
+       xor     3,3,4
+       add     9,9,0
+       andc    0,8,14
+       rotrdi  4,4,23
+       or      5,5,0
+       add     9,9,22
+       xor     3,3,4
+       add     9,9,5
+       add     9,9,3
+
+       rotrdi  3,10,28
+       rotrdi  4,10,34
+       and     5,10,11
+       and     0,10,12
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,11,12
+       xor     3,3,4
+       add     6,6,9
+       xor     5,5,0
+       add     9,9,3
+       add     9,9,5
+
+       rotrdi  3,24,1
+       rotrdi  4,24,8
+       rotrdi  5,21,19
+       rotrdi  0,21,61
+       xor     3,3,4
+       srdi    4,24,7
+       xor     5,5,0
+       srdi    0,21,6
+       add     23,23,16
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,56(7)
+       add     23,23,3
+       add     23,23,5
+       rotrdi  3,6,14
+       rotrdi  4,6,18
+       and     5,14,6
+       xor     3,3,4
+       add     8,8,0
+       andc    0,15,6
+       rotrdi  4,4,23
+       or      5,5,0
+       add     8,8,23
+       xor     3,3,4
+       add     8,8,5
+       add     8,8,3
+
+       rotrdi  3,9,28
+       rotrdi  4,9,34
+       and     5,9,10
+       and     0,9,11
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,10,11
+       xor     3,3,4
+       add     12,12,8
+       xor     5,5,0
+       add     8,8,3
+       add     8,8,5
+
+       rotrdi  3,25,1
+       rotrdi  4,25,8
+       rotrdi  5,22,19
+       rotrdi  0,22,61
+       xor     3,3,4
+       srdi    4,25,7
+       xor     5,5,0
+       srdi    0,22,6
+       add     24,24,17
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,64(7)
+       add     24,24,3
+       add     24,24,5
+       rotrdi  3,12,14
+       rotrdi  4,12,18
+       and     5,6,12
+       xor     3,3,4
+       add     15,15,0
+       andc    0,14,12
+       rotrdi  4,4,23
+       or      5,5,0
+       add     15,15,24
+       xor     3,3,4
+       add     15,15,5
+       add     15,15,3
+
+       rotrdi  3,8,28
+       rotrdi  4,8,34
+       and     5,8,9
+       and     0,8,10
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,9,10
+       xor     3,3,4
+       add     11,11,15
+       xor     5,5,0
+       add     15,15,3
+       add     15,15,5
+
+       rotrdi  3,26,1
+       rotrdi  4,26,8
+       rotrdi  5,23,19
+       rotrdi  0,23,61
+       xor     3,3,4
+       srdi    4,26,7
+       xor     5,5,0
+       srdi    0,23,6
+       add     25,25,18
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,72(7)
+       add     25,25,3
+       add     25,25,5
+       rotrdi  3,11,14
+       rotrdi  4,11,18
+       and     5,12,11
+       xor     3,3,4
+       add     14,14,0
+       andc    0,6,11
+       rotrdi  4,4,23
+       or      5,5,0
+       add     14,14,25
+       xor     3,3,4
+       add     14,14,5
+       add     14,14,3
+
+       rotrdi  3,15,28
+       rotrdi  4,15,34
+       and     5,15,8
+       and     0,15,9
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,8,9
+       xor     3,3,4
+       add     10,10,14
+       xor     5,5,0
+       add     14,14,3
+       add     14,14,5
+
+       rotrdi  3,27,1
+       rotrdi  4,27,8
+       rotrdi  5,24,19
+       rotrdi  0,24,61
+       xor     3,3,4
+       srdi    4,27,7
+       xor     5,5,0
+       srdi    0,24,6
+       add     26,26,19
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,80(7)
+       add     26,26,3
+       add     26,26,5
+       rotrdi  3,10,14
+       rotrdi  4,10,18
+       and     5,11,10
+       xor     3,3,4
+       add     6,6,0
+       andc    0,12,10
+       rotrdi  4,4,23
+       or      5,5,0
+       add     6,6,26
+       xor     3,3,4
+       add     6,6,5
+       add     6,6,3
+
+       rotrdi  3,14,28
+       rotrdi  4,14,34
+       and     5,14,15
+       and     0,14,8
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,15,8
+       xor     3,3,4
+       add     9,9,6
+       xor     5,5,0
+       add     6,6,3
+       add     6,6,5
+
+       rotrdi  3,28,1
+       rotrdi  4,28,8
+       rotrdi  5,25,19
+       rotrdi  0,25,61
+       xor     3,3,4
+       srdi    4,28,7
+       xor     5,5,0
+       srdi    0,25,6
+       add     27,27,20
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,88(7)
+       add     27,27,3
+       add     27,27,5
+       rotrdi  3,9,14
+       rotrdi  4,9,18
+       and     5,10,9
+       xor     3,3,4
+       add     12,12,0
+       andc    0,11,9
+       rotrdi  4,4,23
+       or      5,5,0
+       add     12,12,27
+       xor     3,3,4
+       add     12,12,5
+       add     12,12,3
+
+       rotrdi  3,6,28
+       rotrdi  4,6,34
+       and     5,6,14
+       and     0,6,15
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,14,15
+       xor     3,3,4
+       add     8,8,12
+       xor     5,5,0
+       add     12,12,3
+       add     12,12,5
+
+       rotrdi  3,29,1
+       rotrdi  4,29,8
+       rotrdi  5,26,19
+       rotrdi  0,26,61
+       xor     3,3,4
+       srdi    4,29,7
+       xor     5,5,0
+       srdi    0,26,6
+       add     28,28,21
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,96(7)
+       add     28,28,3
+       add     28,28,5
+       rotrdi  3,8,14
+       rotrdi  4,8,18
+       and     5,9,8
+       xor     3,3,4
+       add     11,11,0
+       andc    0,10,8
+       rotrdi  4,4,23
+       or      5,5,0
+       add     11,11,28
+       xor     3,3,4
+       add     11,11,5
+       add     11,11,3
+
+       rotrdi  3,12,28
+       rotrdi  4,12,34
+       and     5,12,6
+       and     0,12,14
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,6,14
+       xor     3,3,4
+       add     15,15,11
+       xor     5,5,0
+       add     11,11,3
+       add     11,11,5
+
+       rotrdi  3,30,1
+       rotrdi  4,30,8
+       rotrdi  5,27,19
+       rotrdi  0,27,61
+       xor     3,3,4
+       srdi    4,30,7
+       xor     5,5,0
+       srdi    0,27,6
+       add     29,29,22
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,104(7)
+       add     29,29,3
+       add     29,29,5
+       rotrdi  3,15,14
+       rotrdi  4,15,18
+       and     5,8,15
+       xor     3,3,4
+       add     10,10,0
+       andc    0,9,15
+       rotrdi  4,4,23
+       or      5,5,0
+       add     10,10,29
+       xor     3,3,4
+       add     10,10,5
+       add     10,10,3
+
+       rotrdi  3,11,28
+       rotrdi  4,11,34
+       and     5,11,12
+       and     0,11,6
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,12,6
+       xor     3,3,4
+       add     14,14,10
+       xor     5,5,0
+       add     10,10,3
+       add     10,10,5
+
+       rotrdi  3,31,1
+       rotrdi  4,31,8
+       rotrdi  5,28,19
+       rotrdi  0,28,61
+       xor     3,3,4
+       srdi    4,31,7
+       xor     5,5,0
+       srdi    0,28,6
+       add     30,30,23
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,112(7)
+       add     30,30,3
+       add     30,30,5
+       rotrdi  3,14,14
+       rotrdi  4,14,18
+       and     5,15,14
+       xor     3,3,4
+       add     9,9,0
+       andc    0,8,14
+       rotrdi  4,4,23
+       or      5,5,0
+       add     9,9,30
+       xor     3,3,4
+       add     9,9,5
+       add     9,9,3
+
+       rotrdi  3,10,28
+       rotrdi  4,10,34
+       and     5,10,11
+       and     0,10,12
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,11,12
+       xor     3,3,4
+       add     6,6,9
+       xor     5,5,0
+       add     9,9,3
+       add     9,9,5
+
+       rotrdi  3,16,1
+       rotrdi  4,16,8
+       rotrdi  5,29,19
+       rotrdi  0,29,61
+       xor     3,3,4
+       srdi    4,16,7
+       xor     5,5,0
+       srdi    0,29,6
+       add     31,31,24
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,120(7)
+       add     31,31,3
+       add     31,31,5
+       rotrdi  3,6,14
+       rotrdi  4,6,18
+       and     5,14,6
+       xor     3,3,4
+       add     8,8,0
+       andc    0,15,6
+       rotrdi  4,4,23
+       or      5,5,0
+       add     8,8,31
+       xor     3,3,4
+       add     8,8,5
+       add     8,8,3
+
+       rotrdi  3,9,28
+       rotrdi  4,9,34
+       and     5,9,10
+       and     0,9,11
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,10,11
+       xor     3,3,4
+       add     12,12,8
+       xor     5,5,0
+       add     8,8,3
+       add     8,8,5
+
+       bdnz    .Lrounds
+
+       ld      3,208(1)
+       ld      31,200(1)
+       ld      5,192(1)
+       subi    7,7,512
+
+       ld      16,0(3)
+       ld      17,8(3)
+       ld      18,16(3)
+       ld      19,24(3)
+       ld      20,32(3)
+       ld      21,40(3)
+       ld      22,48(3)
+       addi    31,31,128
+       ld      23,56(3)
+       add     8,8,16
+       add     9,9,17
+       std     31,200(1)
+       add     10,10,18
+       std     8,0(3)
+       add     11,11,19
+       std     9,8(3)
+       add     12,12,20
+       std     10,16(3)
+       add     6,6,21
+       std     11,24(3)
+       add     14,14,22
+       std     12,32(3)
+       add     15,15,23
+       std     6,40(3)
+       std     14,48(3)
+       cmpld   31,5
+       std     15,56(3)
+       bne     .Lsha2_block_private
+       blr     
+.long  0
+.byte  0,12,0x14,0,0,0,0,0
+.size  .zfs_sha512_ppc,.-.zfs_sha512_ppc
+.size  zfs_sha512_ppc,.-.zfs_sha512_ppc
+.align 6
+.LPICmeup:
+       mflr    0
+       bcl     20,31,$+4
+       mflr    7
+       addi    7,7,56
+       mtlr    0
+       blr     
+.long  0
+.byte  0,12,0x14,0,0,0,0,0
+.space 28
+.long  0x428a2f98,0xd728ae22
+.long  0x71374491,0x23ef65cd
+.long  0xb5c0fbcf,0xec4d3b2f
+.long  0xe9b5dba5,0x8189dbbc
+.long  0x3956c25b,0xf348b538
+.long  0x59f111f1,0xb605d019
+.long  0x923f82a4,0xaf194f9b
+.long  0xab1c5ed5,0xda6d8118
+.long  0xd807aa98,0xa3030242
+.long  0x12835b01,0x45706fbe
+.long  0x243185be,0x4ee4b28c
+.long  0x550c7dc3,0xd5ffb4e2
+.long  0x72be5d74,0xf27b896f
+.long  0x80deb1fe,0x3b1696b1
+.long  0x9bdc06a7,0x25c71235
+.long  0xc19bf174,0xcf692694
+.long  0xe49b69c1,0x9ef14ad2
+.long  0xefbe4786,0x384f25e3
+.long  0x0fc19dc6,0x8b8cd5b5
+.long  0x240ca1cc,0x77ac9c65
+.long  0x2de92c6f,0x592b0275
+.long  0x4a7484aa,0x6ea6e483
+.long  0x5cb0a9dc,0xbd41fbd4
+.long  0x76f988da,0x831153b5
+.long  0x983e5152,0xee66dfab
+.long  0xa831c66d,0x2db43210
+.long  0xb00327c8,0x98fb213f
+.long  0xbf597fc7,0xbeef0ee4
+.long  0xc6e00bf3,0x3da88fc2
+.long  0xd5a79147,0x930aa725
+.long  0x06ca6351,0xe003826f
+.long  0x14292967,0x0a0e6e70
+.long  0x27b70a85,0x46d22ffc
+.long  0x2e1b2138,0x5c26c926
+.long  0x4d2c6dfc,0x5ac42aed
+.long  0x53380d13,0x9d95b3df
+.long  0x650a7354,0x8baf63de
+.long  0x766a0abb,0x3c77b2a8
+.long  0x81c2c92e,0x47edaee6
+.long  0x92722c85,0x1482353b
+.long  0xa2bfe8a1,0x4cf10364
+.long  0xa81a664b,0xbc423001
+.long  0xc24b8b70,0xd0f89791
+.long  0xc76c51a3,0x0654be30
+.long  0xd192e819,0xd6ef5218
+.long  0xd6990624,0x5565a910
+.long  0xf40e3585,0x5771202a
+.long  0x106aa070,0x32bbd1b8
+.long  0x19a4c116,0xb8d2d0c8
+.long  0x1e376c08,0x5141ab53
+.long  0x2748774c,0xdf8eeb99
+.long  0x34b0bcb5,0xe19b48a8
+.long  0x391c0cb3,0xc5c95a63
+.long  0x4ed8aa4a,0xe3418acb
+.long  0x5b9cca4f,0x7763e373
+.long  0x682e6ff3,0xd6b2b8a3
+.long  0x748f82ee,0x5defb2fc
+.long  0x78a5636f,0x43172f60
+.long  0x84c87814,0xa1f0ab72
+.long  0x8cc70208,0x1a6439ec
+.long  0x90befffa,0x23631e28
+.long  0xa4506ceb,0xde82bde9
+.long  0xbef9a3f7,0xb2c67915
+.long  0xc67178f2,0xe372532b
+.long  0xca273ece,0xea26619c
+.long  0xd186b8c7,0x21c0c207
+.long  0xeada7dd6,0xcde0eb1e
+.long  0xf57d4f7f,0xee6ed178
+.long  0x06f067aa,0x72176fba
+.long  0x0a637dc5,0xa2c898a6
+.long  0x113f9804,0xbef90dae
+.long  0x1b710b35,0x131c471b
+.long  0x28db77f5,0x23047d84
+.long  0x32caab7b,0x40c72493
+.long  0x3c9ebe0a,0x15c9bebc
+.long  0x431d67c4,0x9c100d4c
+.long  0x4cc5d4be,0xcb3e42b6
+.long  0x597f299c,0xfc657e2a
+.long  0x5fcb6fab,0x3ad6faec
+.long  0x6c44198c,0x4a475817
+
+#elif (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+
+.abiversion    2
+.text
+
+.globl zfs_sha512_ppc
+.type  zfs_sha512_ppc,@function
+.align 6
+zfs_sha512_ppc:
+.localentry    zfs_sha512_ppc,0
+
+       stdu    1,-384(1)
+       mflr    0
+       sldi    5,5,7
+
+       std     3,208(1)
+
+       std     14,240(1)
+       std     15,248(1)
+       std     16,256(1)
+       std     17,264(1)
+       std     18,272(1)
+       std     19,280(1)
+       std     20,288(1)
+       std     21,296(1)
+       std     22,304(1)
+       std     23,312(1)
+       std     24,320(1)
+       std     25,328(1)
+       std     26,336(1)
+       std     27,344(1)
+       std     28,352(1)
+       std     29,360(1)
+       std     30,368(1)
+       std     31,376(1)
+       std     0,400(1)
+       ld      8,0(3)
+       mr      31,4
+       ld      9,8(3)
+       ld      10,16(3)
+       ld      11,24(3)
+       ld      12,32(3)
+       ld      6,40(3)
+       ld      14,48(3)
+       ld      15,56(3)
+       bl      .LPICmeup
+.LPICedup:
+       andi.   0,31,3
+       bne     .Lunaligned
+.Laligned:
+       add     5,31,5
+       std     5,192(1)
+       std     31,200(1)
+       bl      .Lsha2_block_private
+       b       .Ldone
+
+.align 4
+.Lunaligned:
+       subfic  0,31,4096
+       andi.   0,0,3968
+       beq     .Lcross_page
+       cmpld   5,0
+       ble     .Laligned
+       subfc   5,0,5
+       add     0,31,0
+       std     5,184(1)
+       std     0,192(1)
+       std     31,200(1)
+       bl      .Lsha2_block_private
+
+       ld      5,184(1)
+.Lcross_page:
+       li      0,32
+       mtctr   0
+       addi    20,1,48
+.Lmemcpy:
+       lbz     16,0(31)
+       lbz     17,1(31)
+       lbz     18,2(31)
+       lbz     19,3(31)
+       addi    31,31,4
+       stb     16,0(20)
+       stb     17,1(20)
+       stb     18,2(20)
+       stb     19,3(20)
+       addi    20,20,4
+       bdnz    .Lmemcpy
+       std     31,176(1)
+       addi    0,1,176
+       addi    31,1,48
+       std     5,184(1)
+       std     0,192(1)
+       std     31,200(1)
+       bl      .Lsha2_block_private
+       ld      31,176(1)
+       ld      5,184(1)
+       addic.  5,5,-128
+       bne     .Lunaligned
+
+.Ldone:
+       ld      0,400(1)
+       ld      14,240(1)
+       ld      15,248(1)
+       ld      16,256(1)
+       ld      17,264(1)
+       ld      18,272(1)
+       ld      19,280(1)
+       ld      20,288(1)
+       ld      21,296(1)
+       ld      22,304(1)
+       ld      23,312(1)
+       ld      24,320(1)
+       ld      25,328(1)
+       ld      26,336(1)
+       ld      27,344(1)
+       ld      28,352(1)
+       ld      29,360(1)
+       ld      30,368(1)
+       ld      31,376(1)
+       mtlr    0
+       addi    1,1,384
+       blr     
+.long  0
+.byte  0,12,4,1,0x80,18,3,0
+.long  0
+.align 4
+.Lsha2_block_private:
+       ld      0,0(7)
+       lwz     3,0(31)
+       lwz     4,4(31)
+       rotlwi  5,3,8
+       rotlwi  16,4,8
+       rlwimi  5,3,24,0,7
+       rlwimi  16,4,24,0,7
+       rlwimi  5,3,24,16,23
+       rlwimi  16,4,24,16,23
+       insrdi  16,5,32,0
+       rotrdi  3,12,14
+       rotrdi  4,12,18
+       and     5,6,12
+       xor     3,3,4
+       add     15,15,0
+       andc    0,14,12
+       rotrdi  4,4,23
+       or      5,5,0
+       add     15,15,16
+       xor     3,3,4
+       add     15,15,5
+       add     15,15,3
+
+       rotrdi  3,8,28
+       rotrdi  4,8,34
+       and     5,8,9
+       and     0,8,10
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,9,10
+       xor     3,3,4
+       add     11,11,15
+       xor     5,5,0
+       ld      0,8(7)
+       add     15,15,3
+       add     15,15,5
+
+       lwz     3,8(31)
+       lwz     4,12(31)
+       rotlwi  5,3,8
+       rotlwi  17,4,8
+       rlwimi  5,3,24,0,7
+       rlwimi  17,4,24,0,7
+       rlwimi  5,3,24,16,23
+       rlwimi  17,4,24,16,23
+       insrdi  17,5,32,0
+       rotrdi  3,11,14
+       rotrdi  4,11,18
+       and     5,12,11
+       xor     3,3,4
+       add     14,14,0
+       andc    0,6,11
+       rotrdi  4,4,23
+       or      5,5,0
+       add     14,14,17
+       xor     3,3,4
+       add     14,14,5
+       add     14,14,3
+
+       rotrdi  3,15,28
+       rotrdi  4,15,34
+       and     5,15,8
+       and     0,15,9
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,8,9
+       xor     3,3,4
+       add     10,10,14
+       xor     5,5,0
+       ld      0,16(7)
+       add     14,14,3
+       add     14,14,5
+
+       lwz     3,16(31)
+       lwz     4,20(31)
+       rotlwi  5,3,8
+       rotlwi  18,4,8
+       rlwimi  5,3,24,0,7
+       rlwimi  18,4,24,0,7
+       rlwimi  5,3,24,16,23
+       rlwimi  18,4,24,16,23
+       insrdi  18,5,32,0
+       rotrdi  3,10,14
+       rotrdi  4,10,18
+       and     5,11,10
+       xor     3,3,4
+       add     6,6,0
+       andc    0,12,10
+       rotrdi  4,4,23
+       or      5,5,0
+       add     6,6,18
+       xor     3,3,4
+       add     6,6,5
+       add     6,6,3
+
+       rotrdi  3,14,28
+       rotrdi  4,14,34
+       and     5,14,15
+       and     0,14,8
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,15,8
+       xor     3,3,4
+       add     9,9,6
+       xor     5,5,0
+       ld      0,24(7)
+       add     6,6,3
+       add     6,6,5
+
+       lwz     3,24(31)
+       lwz     4,28(31)
+       rotlwi  5,3,8
+       rotlwi  19,4,8
+       rlwimi  5,3,24,0,7
+       rlwimi  19,4,24,0,7
+       rlwimi  5,3,24,16,23
+       rlwimi  19,4,24,16,23
+       insrdi  19,5,32,0
+       rotrdi  3,9,14
+       rotrdi  4,9,18
+       and     5,10,9
+       xor     3,3,4
+       add     12,12,0
+       andc    0,11,9
+       rotrdi  4,4,23
+       or      5,5,0
+       add     12,12,19
+       xor     3,3,4
+       add     12,12,5
+       add     12,12,3
+
+       rotrdi  3,6,28
+       rotrdi  4,6,34
+       and     5,6,14
+       and     0,6,15
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,14,15
+       xor     3,3,4
+       add     8,8,12
+       xor     5,5,0
+       ld      0,32(7)
+       add     12,12,3
+       add     12,12,5
+
+       lwz     3,32(31)
+       lwz     4,36(31)
+       rotlwi  5,3,8
+       rotlwi  20,4,8
+       rlwimi  5,3,24,0,7
+       rlwimi  20,4,24,0,7
+       rlwimi  5,3,24,16,23
+       rlwimi  20,4,24,16,23
+       insrdi  20,5,32,0
+       rotrdi  3,8,14
+       rotrdi  4,8,18
+       and     5,9,8
+       xor     3,3,4
+       add     11,11,0
+       andc    0,10,8
+       rotrdi  4,4,23
+       or      5,5,0
+       add     11,11,20
+       xor     3,3,4
+       add     11,11,5
+       add     11,11,3
+
+       rotrdi  3,12,28
+       rotrdi  4,12,34
+       and     5,12,6
+       and     0,12,14
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,6,14
+       xor     3,3,4
+       add     15,15,11
+       xor     5,5,0
+       ld      0,40(7)
+       add     11,11,3
+       add     11,11,5
+
+       lwz     3,40(31)
+       lwz     4,44(31)
+       rotlwi  5,3,8
+       rotlwi  21,4,8
+       rlwimi  5,3,24,0,7
+       rlwimi  21,4,24,0,7
+       rlwimi  5,3,24,16,23
+       rlwimi  21,4,24,16,23
+       insrdi  21,5,32,0
+       rotrdi  3,15,14
+       rotrdi  4,15,18
+       and     5,8,15
+       xor     3,3,4
+       add     10,10,0
+       andc    0,9,15
+       rotrdi  4,4,23
+       or      5,5,0
+       add     10,10,21
+       xor     3,3,4
+       add     10,10,5
+       add     10,10,3
+
+       rotrdi  3,11,28
+       rotrdi  4,11,34
+       and     5,11,12
+       and     0,11,6
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,12,6
+       xor     3,3,4
+       add     14,14,10
+       xor     5,5,0
+       ld      0,48(7)
+       add     10,10,3
+       add     10,10,5
+
+       lwz     3,48(31)
+       lwz     4,52(31)
+       rotlwi  5,3,8
+       rotlwi  22,4,8
+       rlwimi  5,3,24,0,7
+       rlwimi  22,4,24,0,7
+       rlwimi  5,3,24,16,23
+       rlwimi  22,4,24,16,23
+       insrdi  22,5,32,0
+       rotrdi  3,14,14
+       rotrdi  4,14,18
+       and     5,15,14
+       xor     3,3,4
+       add     9,9,0
+       andc    0,8,14
+       rotrdi  4,4,23
+       or      5,5,0
+       add     9,9,22
+       xor     3,3,4
+       add     9,9,5
+       add     9,9,3
+
+       rotrdi  3,10,28
+       rotrdi  4,10,34
+       and     5,10,11
+       and     0,10,12
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,11,12
+       xor     3,3,4
+       add     6,6,9
+       xor     5,5,0
+       ld      0,56(7)
+       add     9,9,3
+       add     9,9,5
+
+       lwz     3,56(31)
+       lwz     4,60(31)
+       rotlwi  5,3,8
+       rotlwi  23,4,8
+       rlwimi  5,3,24,0,7
+       rlwimi  23,4,24,0,7
+       rlwimi  5,3,24,16,23
+       rlwimi  23,4,24,16,23
+       insrdi  23,5,32,0
+       rotrdi  3,6,14
+       rotrdi  4,6,18
+       and     5,14,6
+       xor     3,3,4
+       add     8,8,0
+       andc    0,15,6
+       rotrdi  4,4,23
+       or      5,5,0
+       add     8,8,23
+       xor     3,3,4
+       add     8,8,5
+       add     8,8,3
+
+       rotrdi  3,9,28
+       rotrdi  4,9,34
+       and     5,9,10
+       and     0,9,11
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,10,11
+       xor     3,3,4
+       add     12,12,8
+       xor     5,5,0
+       ld      0,64(7)
+       add     8,8,3
+       add     8,8,5
+
+       lwz     3,64(31)
+       lwz     4,68(31)
+       rotlwi  5,3,8
+       rotlwi  24,4,8
+       rlwimi  5,3,24,0,7
+       rlwimi  24,4,24,0,7
+       rlwimi  5,3,24,16,23
+       rlwimi  24,4,24,16,23
+       insrdi  24,5,32,0
+       rotrdi  3,12,14
+       rotrdi  4,12,18
+       and     5,6,12
+       xor     3,3,4
+       add     15,15,0
+       andc    0,14,12
+       rotrdi  4,4,23
+       or      5,5,0
+       add     15,15,24
+       xor     3,3,4
+       add     15,15,5
+       add     15,15,3
+
+       rotrdi  3,8,28
+       rotrdi  4,8,34
+       and     5,8,9
+       and     0,8,10
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,9,10
+       xor     3,3,4
+       add     11,11,15
+       xor     5,5,0
+       ld      0,72(7)
+       add     15,15,3
+       add     15,15,5
+
+       lwz     3,72(31)
+       lwz     4,76(31)
+       rotlwi  5,3,8
+       rotlwi  25,4,8
+       rlwimi  5,3,24,0,7
+       rlwimi  25,4,24,0,7
+       rlwimi  5,3,24,16,23
+       rlwimi  25,4,24,16,23
+       insrdi  25,5,32,0
+       rotrdi  3,11,14
+       rotrdi  4,11,18
+       and     5,12,11
+       xor     3,3,4
+       add     14,14,0
+       andc    0,6,11
+       rotrdi  4,4,23
+       or      5,5,0
+       add     14,14,25
+       xor     3,3,4
+       add     14,14,5
+       add     14,14,3
+
+       rotrdi  3,15,28
+       rotrdi  4,15,34
+       and     5,15,8
+       and     0,15,9
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,8,9
+       xor     3,3,4
+       add     10,10,14
+       xor     5,5,0
+       ld      0,80(7)
+       add     14,14,3
+       add     14,14,5
+
+       lwz     3,80(31)
+       lwz     4,84(31)
+       rotlwi  5,3,8
+       rotlwi  26,4,8
+       rlwimi  5,3,24,0,7
+       rlwimi  26,4,24,0,7
+       rlwimi  5,3,24,16,23
+       rlwimi  26,4,24,16,23
+       insrdi  26,5,32,0
+       rotrdi  3,10,14
+       rotrdi  4,10,18
+       and     5,11,10
+       xor     3,3,4
+       add     6,6,0
+       andc    0,12,10
+       rotrdi  4,4,23
+       or      5,5,0
+       add     6,6,26
+       xor     3,3,4
+       add     6,6,5
+       add     6,6,3
+
+       rotrdi  3,14,28
+       rotrdi  4,14,34
+       and     5,14,15
+       and     0,14,8
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,15,8
+       xor     3,3,4
+       add     9,9,6
+       xor     5,5,0
+       ld      0,88(7)
+       add     6,6,3
+       add     6,6,5
+
+       lwz     3,88(31)
+       lwz     4,92(31)
+       rotlwi  5,3,8
+       rotlwi  27,4,8
+       rlwimi  5,3,24,0,7
+       rlwimi  27,4,24,0,7
+       rlwimi  5,3,24,16,23
+       rlwimi  27,4,24,16,23
+       insrdi  27,5,32,0
+       rotrdi  3,9,14
+       rotrdi  4,9,18
+       and     5,10,9
+       xor     3,3,4
+       add     12,12,0
+       andc    0,11,9
+       rotrdi  4,4,23
+       or      5,5,0
+       add     12,12,27
+       xor     3,3,4
+       add     12,12,5
+       add     12,12,3
+
+       rotrdi  3,6,28
+       rotrdi  4,6,34
+       and     5,6,14
+       and     0,6,15
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,14,15
+       xor     3,3,4
+       add     8,8,12
+       xor     5,5,0
+       ld      0,96(7)
+       add     12,12,3
+       add     12,12,5
+
+       lwz     3,96(31)
+       lwz     4,100(31)
+       rotlwi  5,3,8
+       rotlwi  28,4,8
+       rlwimi  5,3,24,0,7
+       rlwimi  28,4,24,0,7
+       rlwimi  5,3,24,16,23
+       rlwimi  28,4,24,16,23
+       insrdi  28,5,32,0
+       rotrdi  3,8,14
+       rotrdi  4,8,18
+       and     5,9,8
+       xor     3,3,4
+       add     11,11,0
+       andc    0,10,8
+       rotrdi  4,4,23
+       or      5,5,0
+       add     11,11,28
+       xor     3,3,4
+       add     11,11,5
+       add     11,11,3
+
+       rotrdi  3,12,28
+       rotrdi  4,12,34
+       and     5,12,6
+       and     0,12,14
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,6,14
+       xor     3,3,4
+       add     15,15,11
+       xor     5,5,0
+       ld      0,104(7)
+       add     11,11,3
+       add     11,11,5
+
+       lwz     3,104(31)
+       lwz     4,108(31)
+       rotlwi  5,3,8
+       rotlwi  29,4,8
+       rlwimi  5,3,24,0,7
+       rlwimi  29,4,24,0,7
+       rlwimi  5,3,24,16,23
+       rlwimi  29,4,24,16,23
+       insrdi  29,5,32,0
+       rotrdi  3,15,14
+       rotrdi  4,15,18
+       and     5,8,15
+       xor     3,3,4
+       add     10,10,0
+       andc    0,9,15
+       rotrdi  4,4,23
+       or      5,5,0
+       add     10,10,29
+       xor     3,3,4
+       add     10,10,5
+       add     10,10,3
+
+       rotrdi  3,11,28
+       rotrdi  4,11,34
+       and     5,11,12
+       and     0,11,6
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,12,6
+       xor     3,3,4
+       add     14,14,10
+       xor     5,5,0
+       ld      0,112(7)
+       add     10,10,3
+       add     10,10,5
+
+       lwz     3,112(31)
+       lwz     4,116(31)
+       rotlwi  5,3,8
+       rotlwi  30,4,8
+       rlwimi  5,3,24,0,7
+       rlwimi  30,4,24,0,7
+       rlwimi  5,3,24,16,23
+       rlwimi  30,4,24,16,23
+       insrdi  30,5,32,0
+       rotrdi  3,14,14
+       rotrdi  4,14,18
+       and     5,15,14
+       xor     3,3,4
+       add     9,9,0
+       andc    0,8,14
+       rotrdi  4,4,23
+       or      5,5,0
+       add     9,9,30
+       xor     3,3,4
+       add     9,9,5
+       add     9,9,3
+
+       rotrdi  3,10,28
+       rotrdi  4,10,34
+       and     5,10,11
+       and     0,10,12
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,11,12
+       xor     3,3,4
+       add     6,6,9
+       xor     5,5,0
+       ld      0,120(7)
+       add     9,9,3
+       add     9,9,5
+
+       lwz     3,120(31)
+       lwz     4,124(31)
+       rotlwi  5,3,8
+       rotlwi  31,4,8
+       rlwimi  5,3,24,0,7
+       rlwimi  31,4,24,0,7
+       rlwimi  5,3,24,16,23
+       rlwimi  31,4,24,16,23
+       insrdi  31,5,32,0
+       rotrdi  3,6,14
+       rotrdi  4,6,18
+       and     5,14,6
+       xor     3,3,4
+       add     8,8,0
+       andc    0,15,6
+       rotrdi  4,4,23
+       or      5,5,0
+       add     8,8,31
+       xor     3,3,4
+       add     8,8,5
+       add     8,8,3
+
+       rotrdi  3,9,28
+       rotrdi  4,9,34
+       and     5,9,10
+       and     0,9,11
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,10,11
+       xor     3,3,4
+       add     12,12,8
+       xor     5,5,0
+       add     8,8,3
+       add     8,8,5
+
+       li      5,4
+       mtctr   5
+.align 4
+.Lrounds:
+       addi    7,7,128
+       rotrdi  3,17,1
+       rotrdi  4,17,8
+       rotrdi  5,30,19
+       rotrdi  0,30,61
+       xor     3,3,4
+       srdi    4,17,7
+       xor     5,5,0
+       srdi    0,30,6
+       add     16,16,25
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,0(7)
+       add     16,16,3
+       add     16,16,5
+       rotrdi  3,12,14
+       rotrdi  4,12,18
+       and     5,6,12
+       xor     3,3,4
+       add     15,15,0
+       andc    0,14,12
+       rotrdi  4,4,23
+       or      5,5,0
+       add     15,15,16
+       xor     3,3,4
+       add     15,15,5
+       add     15,15,3
+
+       rotrdi  3,8,28
+       rotrdi  4,8,34
+       and     5,8,9
+       and     0,8,10
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,9,10
+       xor     3,3,4
+       add     11,11,15
+       xor     5,5,0
+       add     15,15,3
+       add     15,15,5
+
+       rotrdi  3,18,1
+       rotrdi  4,18,8
+       rotrdi  5,31,19
+       rotrdi  0,31,61
+       xor     3,3,4
+       srdi    4,18,7
+       xor     5,5,0
+       srdi    0,31,6
+       add     17,17,26
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,8(7)
+       add     17,17,3
+       add     17,17,5
+       rotrdi  3,11,14
+       rotrdi  4,11,18
+       and     5,12,11
+       xor     3,3,4
+       add     14,14,0
+       andc    0,6,11
+       rotrdi  4,4,23
+       or      5,5,0
+       add     14,14,17
+       xor     3,3,4
+       add     14,14,5
+       add     14,14,3
+
+       rotrdi  3,15,28
+       rotrdi  4,15,34
+       and     5,15,8
+       and     0,15,9
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,8,9
+       xor     3,3,4
+       add     10,10,14
+       xor     5,5,0
+       add     14,14,3
+       add     14,14,5
+
+       rotrdi  3,19,1
+       rotrdi  4,19,8
+       rotrdi  5,16,19
+       rotrdi  0,16,61
+       xor     3,3,4
+       srdi    4,19,7
+       xor     5,5,0
+       srdi    0,16,6
+       add     18,18,27
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,16(7)
+       add     18,18,3
+       add     18,18,5
+       rotrdi  3,10,14
+       rotrdi  4,10,18
+       and     5,11,10
+       xor     3,3,4
+       add     6,6,0
+       andc    0,12,10
+       rotrdi  4,4,23
+       or      5,5,0
+       add     6,6,18
+       xor     3,3,4
+       add     6,6,5
+       add     6,6,3
+
+       rotrdi  3,14,28
+       rotrdi  4,14,34
+       and     5,14,15
+       and     0,14,8
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,15,8
+       xor     3,3,4
+       add     9,9,6
+       xor     5,5,0
+       add     6,6,3
+       add     6,6,5
+
+       rotrdi  3,20,1
+       rotrdi  4,20,8
+       rotrdi  5,17,19
+       rotrdi  0,17,61
+       xor     3,3,4
+       srdi    4,20,7
+       xor     5,5,0
+       srdi    0,17,6
+       add     19,19,28
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,24(7)
+       add     19,19,3
+       add     19,19,5
+       rotrdi  3,9,14
+       rotrdi  4,9,18
+       and     5,10,9
+       xor     3,3,4
+       add     12,12,0
+       andc    0,11,9
+       rotrdi  4,4,23
+       or      5,5,0
+       add     12,12,19
+       xor     3,3,4
+       add     12,12,5
+       add     12,12,3
+
+       rotrdi  3,6,28
+       rotrdi  4,6,34
+       and     5,6,14
+       and     0,6,15
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,14,15
+       xor     3,3,4
+       add     8,8,12
+       xor     5,5,0
+       add     12,12,3
+       add     12,12,5
+
+       rotrdi  3,21,1
+       rotrdi  4,21,8
+       rotrdi  5,18,19
+       rotrdi  0,18,61
+       xor     3,3,4
+       srdi    4,21,7
+       xor     5,5,0
+       srdi    0,18,6
+       add     20,20,29
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,32(7)
+       add     20,20,3
+       add     20,20,5
+       rotrdi  3,8,14
+       rotrdi  4,8,18
+       and     5,9,8
+       xor     3,3,4
+       add     11,11,0
+       andc    0,10,8
+       rotrdi  4,4,23
+       or      5,5,0
+       add     11,11,20
+       xor     3,3,4
+       add     11,11,5
+       add     11,11,3
+
+       rotrdi  3,12,28
+       rotrdi  4,12,34
+       and     5,12,6
+       and     0,12,14
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,6,14
+       xor     3,3,4
+       add     15,15,11
+       xor     5,5,0
+       add     11,11,3
+       add     11,11,5
+
+       rotrdi  3,22,1
+       rotrdi  4,22,8
+       rotrdi  5,19,19
+       rotrdi  0,19,61
+       xor     3,3,4
+       srdi    4,22,7
+       xor     5,5,0
+       srdi    0,19,6
+       add     21,21,30
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,40(7)
+       add     21,21,3
+       add     21,21,5
+       rotrdi  3,15,14
+       rotrdi  4,15,18
+       and     5,8,15
+       xor     3,3,4
+       add     10,10,0
+       andc    0,9,15
+       rotrdi  4,4,23
+       or      5,5,0
+       add     10,10,21
+       xor     3,3,4
+       add     10,10,5
+       add     10,10,3
+
+       rotrdi  3,11,28
+       rotrdi  4,11,34
+       and     5,11,12
+       and     0,11,6
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,12,6
+       xor     3,3,4
+       add     14,14,10
+       xor     5,5,0
+       add     10,10,3
+       add     10,10,5
+
+       rotrdi  3,23,1
+       rotrdi  4,23,8
+       rotrdi  5,20,19
+       rotrdi  0,20,61
+       xor     3,3,4
+       srdi    4,23,7
+       xor     5,5,0
+       srdi    0,20,6
+       add     22,22,31
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,48(7)
+       add     22,22,3
+       add     22,22,5
+       rotrdi  3,14,14
+       rotrdi  4,14,18
+       and     5,15,14
+       xor     3,3,4
+       add     9,9,0
+       andc    0,8,14
+       rotrdi  4,4,23
+       or      5,5,0
+       add     9,9,22
+       xor     3,3,4
+       add     9,9,5
+       add     9,9,3
+
+       rotrdi  3,10,28
+       rotrdi  4,10,34
+       and     5,10,11
+       and     0,10,12
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,11,12
+       xor     3,3,4
+       add     6,6,9
+       xor     5,5,0
+       add     9,9,3
+       add     9,9,5
+
+       rotrdi  3,24,1
+       rotrdi  4,24,8
+       rotrdi  5,21,19
+       rotrdi  0,21,61
+       xor     3,3,4
+       srdi    4,24,7
+       xor     5,5,0
+       srdi    0,21,6
+       add     23,23,16
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,56(7)
+       add     23,23,3
+       add     23,23,5
+       rotrdi  3,6,14
+       rotrdi  4,6,18
+       and     5,14,6
+       xor     3,3,4
+       add     8,8,0
+       andc    0,15,6
+       rotrdi  4,4,23
+       or      5,5,0
+       add     8,8,23
+       xor     3,3,4
+       add     8,8,5
+       add     8,8,3
+
+       rotrdi  3,9,28
+       rotrdi  4,9,34
+       and     5,9,10
+       and     0,9,11
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,10,11
+       xor     3,3,4
+       add     12,12,8
+       xor     5,5,0
+       add     8,8,3
+       add     8,8,5
+
+       rotrdi  3,25,1
+       rotrdi  4,25,8
+       rotrdi  5,22,19
+       rotrdi  0,22,61
+       xor     3,3,4
+       srdi    4,25,7
+       xor     5,5,0
+       srdi    0,22,6
+       add     24,24,17
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,64(7)
+       add     24,24,3
+       add     24,24,5
+       rotrdi  3,12,14
+       rotrdi  4,12,18
+       and     5,6,12
+       xor     3,3,4
+       add     15,15,0
+       andc    0,14,12
+       rotrdi  4,4,23
+       or      5,5,0
+       add     15,15,24
+       xor     3,3,4
+       add     15,15,5
+       add     15,15,3
+
+       rotrdi  3,8,28
+       rotrdi  4,8,34
+       and     5,8,9
+       and     0,8,10
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,9,10
+       xor     3,3,4
+       add     11,11,15
+       xor     5,5,0
+       add     15,15,3
+       add     15,15,5
+
+       rotrdi  3,26,1
+       rotrdi  4,26,8
+       rotrdi  5,23,19
+       rotrdi  0,23,61
+       xor     3,3,4
+       srdi    4,26,7
+       xor     5,5,0
+       srdi    0,23,6
+       add     25,25,18
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,72(7)
+       add     25,25,3
+       add     25,25,5
+       rotrdi  3,11,14
+       rotrdi  4,11,18
+       and     5,12,11
+       xor     3,3,4
+       add     14,14,0
+       andc    0,6,11
+       rotrdi  4,4,23
+       or      5,5,0
+       add     14,14,25
+       xor     3,3,4
+       add     14,14,5
+       add     14,14,3
+
+       rotrdi  3,15,28
+       rotrdi  4,15,34
+       and     5,15,8
+       and     0,15,9
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,8,9
+       xor     3,3,4
+       add     10,10,14
+       xor     5,5,0
+       add     14,14,3
+       add     14,14,5
+
+       rotrdi  3,27,1
+       rotrdi  4,27,8
+       rotrdi  5,24,19
+       rotrdi  0,24,61
+       xor     3,3,4
+       srdi    4,27,7
+       xor     5,5,0
+       srdi    0,24,6
+       add     26,26,19
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,80(7)
+       add     26,26,3
+       add     26,26,5
+       rotrdi  3,10,14
+       rotrdi  4,10,18
+       and     5,11,10
+       xor     3,3,4
+       add     6,6,0
+       andc    0,12,10
+       rotrdi  4,4,23
+       or      5,5,0
+       add     6,6,26
+       xor     3,3,4
+       add     6,6,5
+       add     6,6,3
+
+       rotrdi  3,14,28
+       rotrdi  4,14,34
+       and     5,14,15
+       and     0,14,8
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,15,8
+       xor     3,3,4
+       add     9,9,6
+       xor     5,5,0
+       add     6,6,3
+       add     6,6,5
+
+       rotrdi  3,28,1
+       rotrdi  4,28,8
+       rotrdi  5,25,19
+       rotrdi  0,25,61
+       xor     3,3,4
+       srdi    4,28,7
+       xor     5,5,0
+       srdi    0,25,6
+       add     27,27,20
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,88(7)
+       add     27,27,3
+       add     27,27,5
+       rotrdi  3,9,14
+       rotrdi  4,9,18
+       and     5,10,9
+       xor     3,3,4
+       add     12,12,0
+       andc    0,11,9
+       rotrdi  4,4,23
+       or      5,5,0
+       add     12,12,27
+       xor     3,3,4
+       add     12,12,5
+       add     12,12,3
+
+       rotrdi  3,6,28
+       rotrdi  4,6,34
+       and     5,6,14
+       and     0,6,15
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,14,15
+       xor     3,3,4
+       add     8,8,12
+       xor     5,5,0
+       add     12,12,3
+       add     12,12,5
+
+       rotrdi  3,29,1
+       rotrdi  4,29,8
+       rotrdi  5,26,19
+       rotrdi  0,26,61
+       xor     3,3,4
+       srdi    4,29,7
+       xor     5,5,0
+       srdi    0,26,6
+       add     28,28,21
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,96(7)
+       add     28,28,3
+       add     28,28,5
+       rotrdi  3,8,14
+       rotrdi  4,8,18
+       and     5,9,8
+       xor     3,3,4
+       add     11,11,0
+       andc    0,10,8
+       rotrdi  4,4,23
+       or      5,5,0
+       add     11,11,28
+       xor     3,3,4
+       add     11,11,5
+       add     11,11,3
+
+       rotrdi  3,12,28
+       rotrdi  4,12,34
+       and     5,12,6
+       and     0,12,14
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,6,14
+       xor     3,3,4
+       add     15,15,11
+       xor     5,5,0
+       add     11,11,3
+       add     11,11,5
+
+       rotrdi  3,30,1
+       rotrdi  4,30,8
+       rotrdi  5,27,19
+       rotrdi  0,27,61
+       xor     3,3,4
+       srdi    4,30,7
+       xor     5,5,0
+       srdi    0,27,6
+       add     29,29,22
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,104(7)
+       add     29,29,3
+       add     29,29,5
+       rotrdi  3,15,14
+       rotrdi  4,15,18
+       and     5,8,15
+       xor     3,3,4
+       add     10,10,0
+       andc    0,9,15
+       rotrdi  4,4,23
+       or      5,5,0
+       add     10,10,29
+       xor     3,3,4
+       add     10,10,5
+       add     10,10,3
+
+       rotrdi  3,11,28
+       rotrdi  4,11,34
+       and     5,11,12
+       and     0,11,6
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,12,6
+       xor     3,3,4
+       add     14,14,10
+       xor     5,5,0
+       add     10,10,3
+       add     10,10,5
+
+       rotrdi  3,31,1
+       rotrdi  4,31,8
+       rotrdi  5,28,19
+       rotrdi  0,28,61
+       xor     3,3,4
+       srdi    4,31,7
+       xor     5,5,0
+       srdi    0,28,6
+       add     30,30,23
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,112(7)
+       add     30,30,3
+       add     30,30,5
+       rotrdi  3,14,14
+       rotrdi  4,14,18
+       and     5,15,14
+       xor     3,3,4
+       add     9,9,0
+       andc    0,8,14
+       rotrdi  4,4,23
+       or      5,5,0
+       add     9,9,30
+       xor     3,3,4
+       add     9,9,5
+       add     9,9,3
+
+       rotrdi  3,10,28
+       rotrdi  4,10,34
+       and     5,10,11
+       and     0,10,12
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,11,12
+       xor     3,3,4
+       add     6,6,9
+       xor     5,5,0
+       add     9,9,3
+       add     9,9,5
+
+       rotrdi  3,16,1
+       rotrdi  4,16,8
+       rotrdi  5,29,19
+       rotrdi  0,29,61
+       xor     3,3,4
+       srdi    4,16,7
+       xor     5,5,0
+       srdi    0,29,6
+       add     31,31,24
+       xor     3,3,4
+       xor     5,5,0
+       ld      0,120(7)
+       add     31,31,3
+       add     31,31,5
+       rotrdi  3,6,14
+       rotrdi  4,6,18
+       and     5,14,6
+       xor     3,3,4
+       add     8,8,0
+       andc    0,15,6
+       rotrdi  4,4,23
+       or      5,5,0
+       add     8,8,31
+       xor     3,3,4
+       add     8,8,5
+       add     8,8,3
+
+       rotrdi  3,9,28
+       rotrdi  4,9,34
+       and     5,9,10
+       and     0,9,11
+       xor     3,3,4
+       rotrdi  4,4,5
+       xor     5,5,0
+       and     0,10,11
+       xor     3,3,4
+       add     12,12,8
+       xor     5,5,0
+       add     8,8,3
+       add     8,8,5
+
+       bdnz    .Lrounds
+
+       ld      3,208(1)
+       ld      31,200(1)
+       ld      5,192(1)
+       subi    7,7,512
+
+       ld      16,0(3)
+       ld      17,8(3)
+       ld      18,16(3)
+       ld      19,24(3)
+       ld      20,32(3)
+       ld      21,40(3)
+       ld      22,48(3)
+       addi    31,31,128
+       ld      23,56(3)
+       add     8,8,16
+       add     9,9,17
+       std     31,200(1)
+       add     10,10,18
+       std     8,0(3)
+       add     11,11,19
+       std     9,8(3)
+       add     12,12,20
+       std     10,16(3)
+       add     6,6,21
+       std     11,24(3)
+       add     14,14,22
+       std     12,32(3)
+       add     15,15,23
+       std     6,40(3)
+       std     14,48(3)
+       cmpld   31,5
+       std     15,56(3)
+       bne     .Lsha2_block_private
+       blr     
+.long  0
+.byte  0,12,0x14,0,0,0,0,0
+.size  zfs_sha512_ppc,.-zfs_sha512_ppc
+.align 6
+.LPICmeup:
+       mflr    0
+       bcl     20,31,$+4
+       mflr    7
+       addi    7,7,56
+       mtlr    0
+       blr     
+.long  0
+.byte  0,12,0x14,0,0,0,0,0
+.space 28
+.long  0xd728ae22,0x428a2f98
+.long  0x23ef65cd,0x71374491
+.long  0xec4d3b2f,0xb5c0fbcf
+.long  0x8189dbbc,0xe9b5dba5
+.long  0xf348b538,0x3956c25b
+.long  0xb605d019,0x59f111f1
+.long  0xaf194f9b,0x923f82a4
+.long  0xda6d8118,0xab1c5ed5
+.long  0xa3030242,0xd807aa98
+.long  0x45706fbe,0x12835b01
+.long  0x4ee4b28c,0x243185be
+.long  0xd5ffb4e2,0x550c7dc3
+.long  0xf27b896f,0x72be5d74
+.long  0x3b1696b1,0x80deb1fe
+.long  0x25c71235,0x9bdc06a7
+.long  0xcf692694,0xc19bf174
+.long  0x9ef14ad2,0xe49b69c1
+.long  0x384f25e3,0xefbe4786
+.long  0x8b8cd5b5,0x0fc19dc6
+.long  0x77ac9c65,0x240ca1cc
+.long  0x592b0275,0x2de92c6f
+.long  0x6ea6e483,0x4a7484aa
+.long  0xbd41fbd4,0x5cb0a9dc
+.long  0x831153b5,0x76f988da
+.long  0xee66dfab,0x983e5152
+.long  0x2db43210,0xa831c66d
+.long  0x98fb213f,0xb00327c8
+.long  0xbeef0ee4,0xbf597fc7
+.long  0x3da88fc2,0xc6e00bf3
+.long  0x930aa725,0xd5a79147
+.long  0xe003826f,0x06ca6351
+.long  0x0a0e6e70,0x14292967
+.long  0x46d22ffc,0x27b70a85
+.long  0x5c26c926,0x2e1b2138
+.long  0x5ac42aed,0x4d2c6dfc
+.long  0x9d95b3df,0x53380d13
+.long  0x8baf63de,0x650a7354
+.long  0x3c77b2a8,0x766a0abb
+.long  0x47edaee6,0x81c2c92e
+.long  0x1482353b,0x92722c85
+.long  0x4cf10364,0xa2bfe8a1
+.long  0xbc423001,0xa81a664b
+.long  0xd0f89791,0xc24b8b70
+.long  0x0654be30,0xc76c51a3
+.long  0xd6ef5218,0xd192e819
+.long  0x5565a910,0xd6990624
+.long  0x5771202a,0xf40e3585
+.long  0x32bbd1b8,0x106aa070
+.long  0xb8d2d0c8,0x19a4c116
+.long  0x5141ab53,0x1e376c08
+.long  0xdf8eeb99,0x2748774c
+.long  0xe19b48a8,0x34b0bcb5
+.long  0xc5c95a63,0x391c0cb3
+.long  0xe3418acb,0x4ed8aa4a
+.long  0x7763e373,0x5b9cca4f
+.long  0xd6b2b8a3,0x682e6ff3
+.long  0x5defb2fc,0x748f82ee
+.long  0x43172f60,0x78a5636f
+.long  0xa1f0ab72,0x84c87814
+.long  0x1a6439ec,0x8cc70208
+.long  0x23631e28,0x90befffa
+.long  0xde82bde9,0xa4506ceb
+.long  0xb2c67915,0xbef9a3f7
+.long  0xe372532b,0xc67178f2
+.long  0xea26619c,0xca273ece
+.long  0x21c0c207,0xd186b8c7
+.long  0xcde0eb1e,0xeada7dd6
+.long  0xee6ed178,0xf57d4f7f
+.long  0x72176fba,0x06f067aa
+.long  0xa2c898a6,0x0a637dc5
+.long  0xbef90dae,0x113f9804
+.long  0x131c471b,0x1b710b35
+.long  0x23047d84,0x28db77f5
+.long  0x40c72493,0x32caab7b
+.long  0x15c9bebc,0x3c9ebe0a
+.long  0x9c100d4c,0x431d67c4
+.long  0xcb3e42b6,0x4cc5d4be
+.long  0xfc657e2a,0x597f299c
+.long  0x3ad6faec,0x5fcb6fab
+.long  0x4a475817,0x6c44198c
+
+#endif
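
For readers tracing the hand-unrolled PPC64 code above: it is a straight expansion of the SHA-512 compression function from FIPS 180-4, and the .long pairs after .LPICmeup are the 80 64-bit round constants, each stored low 32 bits first. The following is a minimal C sketch of the per-round logic the assembly implements; it is illustrative only, not part of this commit, and the helper names are chosen for exposition.

	/*
	 * Illustrative sketch of one SHA-512 round (FIPS 180-4).
	 * Not part of this commit; names are for exposition only.
	 */
	#include <stdint.h>

	#define	ROTR64(x, n)	(((x) >> (n)) | ((x) << (64 - (n))))

	/* Logical functions used by every round. */
	static inline uint64_t Ch(uint64_t x, uint64_t y, uint64_t z)
	{ return ((x & y) ^ (~x & z)); }
	static inline uint64_t Maj(uint64_t x, uint64_t y, uint64_t z)
	{ return ((x & y) ^ (x & z) ^ (y & z)); }
	static inline uint64_t S0(uint64_t x)	/* big Sigma0 */
	{ return (ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39)); }
	static inline uint64_t S1(uint64_t x)	/* big Sigma1 */
	{ return (ROTR64(x, 14) ^ ROTR64(x, 18) ^ ROTR64(x, 41)); }

	/*
	 * One round: st[0..7] holds the working state a..h, Wt is the
	 * expanded message word, Kt the round constant from the table.
	 */
	static inline void
	sha512_round(uint64_t st[8], uint64_t Wt, uint64_t Kt)
	{
		uint64_t T1 = st[7] + S1(st[4]) + Ch(st[4], st[5], st[6]) + Kt + Wt;
		uint64_t T2 = S0(st[0]) + Maj(st[0], st[1], st[2]);

		st[7] = st[6]; st[6] = st[5]; st[5] = st[4];
		st[4] = st[3] + T1;
		st[3] = st[2]; st[2] = st[1]; st[1] = st[0];
		st[0] = T1 + T2;
	}

The rotate counts match the assembly: the rotrdi 28/34 plus a further rotate of 5 gives the 28/34/39 set of Sigma0, and the 14/18 plus 23 gives the 14/18/41 set of Sigma1; the message schedule uses rotates by 1/8 with a shift of 7, and rotates by 19/61 with a shift of 6.
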
diff --git a/module/icp/asm-x86_64/sha2/sha256-x86_64.S b/module/icp/asm-x86_64/sha2/sha256-x86_64.S
new file mode 100644 (file)
index 0000000..f78cd5f
--- /dev/null
+++ b/module/icp/asm-x86_64/sha2/sha256-x86_64.S
@@ -0,0 +1,5104 @@
+/*
+ * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Portions Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ * - modified assembly to fit into OpenZFS
+ */
+
+#if defined(__x86_64)
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+.section .rodata
+
+.align 64
+.type  K256,@object
+K256:
+.long  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long  0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long  0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long  0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long  0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long  0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long  0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long  0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long  0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
+.long  0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long  0x03020100,0x0b0a0908,0xffffffff,0xffffffff
+.long  0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+.long  0xffffffff,0xffffffff,0x03020100,0x0b0a0908
+
+ENTRY_ALIGN(zfs_sha256_transform_x64, 16)
+.cfi_startproc
+       ENDBR
+       movq    %rsp,%rax
+.cfi_def_cfa_register  %rax
+       pushq   %rbx
+.cfi_offset    %rbx,-16
+       pushq   %rbp
+.cfi_offset    %rbp,-24
+       pushq   %r12
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_offset    %r14,-48
+       pushq   %r15
+.cfi_offset    %r15,-56
+       shlq    $4,%rdx
+       subq    $64+32,%rsp
+       leaq    (%rsi,%rdx,4),%rdx
+       andq    $-64,%rsp
+       movq    %rdi,64+0(%rsp)
+       movq    %rsi,64+8(%rsp)
+       movq    %rdx,64+16(%rsp)
+       movq    %rax,88(%rsp)
+.cfi_escape    0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue:
+       movl    0(%rdi),%eax
+       movl    4(%rdi),%ebx
+       movl    8(%rdi),%ecx
+       movl    12(%rdi),%edx
+       movl    16(%rdi),%r8d
+       movl    20(%rdi),%r9d
+       movl    24(%rdi),%r10d
+       movl    28(%rdi),%r11d
+       jmp     .Lloop
+.align 16
+.Lloop:
+       movl    %ebx,%edi
+       leaq    K256(%rip),%rbp
+       xorl    %ecx,%edi
+       movl    0(%rsi),%r12d
+       movl    %r8d,%r13d
+       movl    %eax,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r9d,%r15d
+       xorl    %r8d,%r13d
+       rorl    $9,%r14d
+       xorl    %r10d,%r15d
+       movl    %r12d,0(%rsp)
+       xorl    %eax,%r14d
+       andl    %r8d,%r15d
+       rorl    $5,%r13d
+       addl    %r11d,%r12d
+       xorl    %r10d,%r15d
+       rorl    $11,%r14d
+       xorl    %r8d,%r13d
+       addl    %r15d,%r12d
+       movl    %eax,%r15d
+       addl    (%rbp),%r12d
+       xorl    %eax,%r14d
+       xorl    %ebx,%r15d
+       rorl    $6,%r13d
+       movl    %ebx,%r11d
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %edi,%r11d
+       addl    %r12d,%edx
+       addl    %r12d,%r11d
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%r11d
+       movl    4(%rsi),%r12d
+       movl    %edx,%r13d
+       movl    %r11d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r8d,%edi
+       xorl    %edx,%r13d
+       rorl    $9,%r14d
+       xorl    %r9d,%edi
+       movl    %r12d,4(%rsp)
+       xorl    %r11d,%r14d
+       andl    %edx,%edi
+       rorl    $5,%r13d
+       addl    %r10d,%r12d
+       xorl    %r9d,%edi
+       rorl    $11,%r14d
+       xorl    %edx,%r13d
+       addl    %edi,%r12d
+       movl    %r11d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r11d,%r14d
+       xorl    %eax,%edi
+       rorl    $6,%r13d
+       movl    %eax,%r10d
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %r15d,%r10d
+       addl    %r12d,%ecx
+       addl    %r12d,%r10d
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%r10d
+       movl    8(%rsi),%r12d
+       movl    %ecx,%r13d
+       movl    %r10d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %edx,%r15d
+       xorl    %ecx,%r13d
+       rorl    $9,%r14d
+       xorl    %r8d,%r15d
+       movl    %r12d,8(%rsp)
+       xorl    %r10d,%r14d
+       andl    %ecx,%r15d
+       rorl    $5,%r13d
+       addl    %r9d,%r12d
+       xorl    %r8d,%r15d
+       rorl    $11,%r14d
+       xorl    %ecx,%r13d
+       addl    %r15d,%r12d
+       movl    %r10d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r10d,%r14d
+       xorl    %r11d,%r15d
+       rorl    $6,%r13d
+       movl    %r11d,%r9d
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %edi,%r9d
+       addl    %r12d,%ebx
+       addl    %r12d,%r9d
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%r9d
+       movl    12(%rsi),%r12d
+       movl    %ebx,%r13d
+       movl    %r9d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %ecx,%edi
+       xorl    %ebx,%r13d
+       rorl    $9,%r14d
+       xorl    %edx,%edi
+       movl    %r12d,12(%rsp)
+       xorl    %r9d,%r14d
+       andl    %ebx,%edi
+       rorl    $5,%r13d
+       addl    %r8d,%r12d
+       xorl    %edx,%edi
+       rorl    $11,%r14d
+       xorl    %ebx,%r13d
+       addl    %edi,%r12d
+       movl    %r9d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r9d,%r14d
+       xorl    %r10d,%edi
+       rorl    $6,%r13d
+       movl    %r10d,%r8d
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %r15d,%r8d
+       addl    %r12d,%eax
+       addl    %r12d,%r8d
+       leaq    20(%rbp),%rbp
+       addl    %r14d,%r8d
+       movl    16(%rsi),%r12d
+       movl    %eax,%r13d
+       movl    %r8d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %ebx,%r15d
+       xorl    %eax,%r13d
+       rorl    $9,%r14d
+       xorl    %ecx,%r15d
+       movl    %r12d,16(%rsp)
+       xorl    %r8d,%r14d
+       andl    %eax,%r15d
+       rorl    $5,%r13d
+       addl    %edx,%r12d
+       xorl    %ecx,%r15d
+       rorl    $11,%r14d
+       xorl    %eax,%r13d
+       addl    %r15d,%r12d
+       movl    %r8d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r8d,%r14d
+       xorl    %r9d,%r15d
+       rorl    $6,%r13d
+       movl    %r9d,%edx
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %edi,%edx
+       addl    %r12d,%r11d
+       addl    %r12d,%edx
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%edx
+       movl    20(%rsi),%r12d
+       movl    %r11d,%r13d
+       movl    %edx,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %eax,%edi
+       xorl    %r11d,%r13d
+       rorl    $9,%r14d
+       xorl    %ebx,%edi
+       movl    %r12d,20(%rsp)
+       xorl    %edx,%r14d
+       andl    %r11d,%edi
+       rorl    $5,%r13d
+       addl    %ecx,%r12d
+       xorl    %ebx,%edi
+       rorl    $11,%r14d
+       xorl    %r11d,%r13d
+       addl    %edi,%r12d
+       movl    %edx,%edi
+       addl    (%rbp),%r12d
+       xorl    %edx,%r14d
+       xorl    %r8d,%edi
+       rorl    $6,%r13d
+       movl    %r8d,%ecx
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %r15d,%ecx
+       addl    %r12d,%r10d
+       addl    %r12d,%ecx
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%ecx
+       movl    24(%rsi),%r12d
+       movl    %r10d,%r13d
+       movl    %ecx,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r11d,%r15d
+       xorl    %r10d,%r13d
+       rorl    $9,%r14d
+       xorl    %eax,%r15d
+       movl    %r12d,24(%rsp)
+       xorl    %ecx,%r14d
+       andl    %r10d,%r15d
+       rorl    $5,%r13d
+       addl    %ebx,%r12d
+       xorl    %eax,%r15d
+       rorl    $11,%r14d
+       xorl    %r10d,%r13d
+       addl    %r15d,%r12d
+       movl    %ecx,%r15d
+       addl    (%rbp),%r12d
+       xorl    %ecx,%r14d
+       xorl    %edx,%r15d
+       rorl    $6,%r13d
+       movl    %edx,%ebx
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %edi,%ebx
+       addl    %r12d,%r9d
+       addl    %r12d,%ebx
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%ebx
+       movl    28(%rsi),%r12d
+       movl    %r9d,%r13d
+       movl    %ebx,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r10d,%edi
+       xorl    %r9d,%r13d
+       rorl    $9,%r14d
+       xorl    %r11d,%edi
+       movl    %r12d,28(%rsp)
+       xorl    %ebx,%r14d
+       andl    %r9d,%edi
+       rorl    $5,%r13d
+       addl    %eax,%r12d
+       xorl    %r11d,%edi
+       rorl    $11,%r14d
+       xorl    %r9d,%r13d
+       addl    %edi,%r12d
+       movl    %ebx,%edi
+       addl    (%rbp),%r12d
+       xorl    %ebx,%r14d
+       xorl    %ecx,%edi
+       rorl    $6,%r13d
+       movl    %ecx,%eax
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %r15d,%eax
+       addl    %r12d,%r8d
+       addl    %r12d,%eax
+       leaq    20(%rbp),%rbp
+       addl    %r14d,%eax
+       movl    32(%rsi),%r12d
+       movl    %r8d,%r13d
+       movl    %eax,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r9d,%r15d
+       xorl    %r8d,%r13d
+       rorl    $9,%r14d
+       xorl    %r10d,%r15d
+       movl    %r12d,32(%rsp)
+       xorl    %eax,%r14d
+       andl    %r8d,%r15d
+       rorl    $5,%r13d
+       addl    %r11d,%r12d
+       xorl    %r10d,%r15d
+       rorl    $11,%r14d
+       xorl    %r8d,%r13d
+       addl    %r15d,%r12d
+       movl    %eax,%r15d
+       addl    (%rbp),%r12d
+       xorl    %eax,%r14d
+       xorl    %ebx,%r15d
+       rorl    $6,%r13d
+       movl    %ebx,%r11d
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %edi,%r11d
+       addl    %r12d,%edx
+       addl    %r12d,%r11d
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%r11d
+       movl    36(%rsi),%r12d
+       movl    %edx,%r13d
+       movl    %r11d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r8d,%edi
+       xorl    %edx,%r13d
+       rorl    $9,%r14d
+       xorl    %r9d,%edi
+       movl    %r12d,36(%rsp)
+       xorl    %r11d,%r14d
+       andl    %edx,%edi
+       rorl    $5,%r13d
+       addl    %r10d,%r12d
+       xorl    %r9d,%edi
+       rorl    $11,%r14d
+       xorl    %edx,%r13d
+       addl    %edi,%r12d
+       movl    %r11d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r11d,%r14d
+       xorl    %eax,%edi
+       rorl    $6,%r13d
+       movl    %eax,%r10d
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %r15d,%r10d
+       addl    %r12d,%ecx
+       addl    %r12d,%r10d
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%r10d
+       movl    40(%rsi),%r12d
+       movl    %ecx,%r13d
+       movl    %r10d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %edx,%r15d
+       xorl    %ecx,%r13d
+       rorl    $9,%r14d
+       xorl    %r8d,%r15d
+       movl    %r12d,40(%rsp)
+       xorl    %r10d,%r14d
+       andl    %ecx,%r15d
+       rorl    $5,%r13d
+       addl    %r9d,%r12d
+       xorl    %r8d,%r15d
+       rorl    $11,%r14d
+       xorl    %ecx,%r13d
+       addl    %r15d,%r12d
+       movl    %r10d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r10d,%r14d
+       xorl    %r11d,%r15d
+       rorl    $6,%r13d
+       movl    %r11d,%r9d
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %edi,%r9d
+       addl    %r12d,%ebx
+       addl    %r12d,%r9d
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%r9d
+       movl    44(%rsi),%r12d
+       movl    %ebx,%r13d
+       movl    %r9d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %ecx,%edi
+       xorl    %ebx,%r13d
+       rorl    $9,%r14d
+       xorl    %edx,%edi
+       movl    %r12d,44(%rsp)
+       xorl    %r9d,%r14d
+       andl    %ebx,%edi
+       rorl    $5,%r13d
+       addl    %r8d,%r12d
+       xorl    %edx,%edi
+       rorl    $11,%r14d
+       xorl    %ebx,%r13d
+       addl    %edi,%r12d
+       movl    %r9d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r9d,%r14d
+       xorl    %r10d,%edi
+       rorl    $6,%r13d
+       movl    %r10d,%r8d
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %r15d,%r8d
+       addl    %r12d,%eax
+       addl    %r12d,%r8d
+       leaq    20(%rbp),%rbp
+       addl    %r14d,%r8d
+       movl    48(%rsi),%r12d
+       movl    %eax,%r13d
+       movl    %r8d,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %ebx,%r15d
+       xorl    %eax,%r13d
+       rorl    $9,%r14d
+       xorl    %ecx,%r15d
+       movl    %r12d,48(%rsp)
+       xorl    %r8d,%r14d
+       andl    %eax,%r15d
+       rorl    $5,%r13d
+       addl    %edx,%r12d
+       xorl    %ecx,%r15d
+       rorl    $11,%r14d
+       xorl    %eax,%r13d
+       addl    %r15d,%r12d
+       movl    %r8d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r8d,%r14d
+       xorl    %r9d,%r15d
+       rorl    $6,%r13d
+       movl    %r9d,%edx
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %edi,%edx
+       addl    %r12d,%r11d
+       addl    %r12d,%edx
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%edx
+       movl    52(%rsi),%r12d
+       movl    %r11d,%r13d
+       movl    %edx,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %eax,%edi
+       xorl    %r11d,%r13d
+       rorl    $9,%r14d
+       xorl    %ebx,%edi
+       movl    %r12d,52(%rsp)
+       xorl    %edx,%r14d
+       andl    %r11d,%edi
+       rorl    $5,%r13d
+       addl    %ecx,%r12d
+       xorl    %ebx,%edi
+       rorl    $11,%r14d
+       xorl    %r11d,%r13d
+       addl    %edi,%r12d
+       movl    %edx,%edi
+       addl    (%rbp),%r12d
+       xorl    %edx,%r14d
+       xorl    %r8d,%edi
+       rorl    $6,%r13d
+       movl    %r8d,%ecx
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %r15d,%ecx
+       addl    %r12d,%r10d
+       addl    %r12d,%ecx
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%ecx
+       movl    56(%rsi),%r12d
+       movl    %r10d,%r13d
+       movl    %ecx,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r11d,%r15d
+       xorl    %r10d,%r13d
+       rorl    $9,%r14d
+       xorl    %eax,%r15d
+       movl    %r12d,56(%rsp)
+       xorl    %ecx,%r14d
+       andl    %r10d,%r15d
+       rorl    $5,%r13d
+       addl    %ebx,%r12d
+       xorl    %eax,%r15d
+       rorl    $11,%r14d
+       xorl    %r10d,%r13d
+       addl    %r15d,%r12d
+       movl    %ecx,%r15d
+       addl    (%rbp),%r12d
+       xorl    %ecx,%r14d
+       xorl    %edx,%r15d
+       rorl    $6,%r13d
+       movl    %edx,%ebx
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %edi,%ebx
+       addl    %r12d,%r9d
+       addl    %r12d,%ebx
+       leaq    4(%rbp),%rbp
+       addl    %r14d,%ebx
+       movl    60(%rsi),%r12d
+       movl    %r9d,%r13d
+       movl    %ebx,%r14d
+       bswapl  %r12d
+       rorl    $14,%r13d
+       movl    %r10d,%edi
+       xorl    %r9d,%r13d
+       rorl    $9,%r14d
+       xorl    %r11d,%edi
+       movl    %r12d,60(%rsp)
+       xorl    %ebx,%r14d
+       andl    %r9d,%edi
+       rorl    $5,%r13d
+       addl    %eax,%r12d
+       xorl    %r11d,%edi
+       rorl    $11,%r14d
+       xorl    %r9d,%r13d
+       addl    %edi,%r12d
+       movl    %ebx,%edi
+       addl    (%rbp),%r12d
+       xorl    %ebx,%r14d
+       xorl    %ecx,%edi
+       rorl    $6,%r13d
+       movl    %ecx,%eax
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %r15d,%eax
+       addl    %r12d,%r8d
+       addl    %r12d,%eax
+       leaq    20(%rbp),%rbp
+       jmp     .Lrounds_16_xx
+.align 16
+.Lrounds_16_xx:
+       movl    4(%rsp),%r13d
+       movl    56(%rsp),%r15d
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%eax
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    36(%rsp),%r12d
+       addl    0(%rsp),%r12d
+       movl    %r8d,%r13d
+       addl    %r15d,%r12d
+       movl    %eax,%r14d
+       rorl    $14,%r13d
+       movl    %r9d,%r15d
+       xorl    %r8d,%r13d
+       rorl    $9,%r14d
+       xorl    %r10d,%r15d
+       movl    %r12d,0(%rsp)
+       xorl    %eax,%r14d
+       andl    %r8d,%r15d
+       rorl    $5,%r13d
+       addl    %r11d,%r12d
+       xorl    %r10d,%r15d
+       rorl    $11,%r14d
+       xorl    %r8d,%r13d
+       addl    %r15d,%r12d
+       movl    %eax,%r15d
+       addl    (%rbp),%r12d
+       xorl    %eax,%r14d
+       xorl    %ebx,%r15d
+       rorl    $6,%r13d
+       movl    %ebx,%r11d
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %edi,%r11d
+       addl    %r12d,%edx
+       addl    %r12d,%r11d
+       leaq    4(%rbp),%rbp
+       movl    8(%rsp),%r13d
+       movl    60(%rsp),%edi
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r11d
+       movl    %edi,%r14d
+       rorl    $2,%edi
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    40(%rsp),%r12d
+       addl    4(%rsp),%r12d
+       movl    %edx,%r13d
+       addl    %edi,%r12d
+       movl    %r11d,%r14d
+       rorl    $14,%r13d
+       movl    %r8d,%edi
+       xorl    %edx,%r13d
+       rorl    $9,%r14d
+       xorl    %r9d,%edi
+       movl    %r12d,4(%rsp)
+       xorl    %r11d,%r14d
+       andl    %edx,%edi
+       rorl    $5,%r13d
+       addl    %r10d,%r12d
+       xorl    %r9d,%edi
+       rorl    $11,%r14d
+       xorl    %edx,%r13d
+       addl    %edi,%r12d
+       movl    %r11d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r11d,%r14d
+       xorl    %eax,%edi
+       rorl    $6,%r13d
+       movl    %eax,%r10d
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %r15d,%r10d
+       addl    %r12d,%ecx
+       addl    %r12d,%r10d
+       leaq    4(%rbp),%rbp
+       movl    12(%rsp),%r13d
+       movl    0(%rsp),%r15d
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r10d
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    44(%rsp),%r12d
+       addl    8(%rsp),%r12d
+       movl    %ecx,%r13d
+       addl    %r15d,%r12d
+       movl    %r10d,%r14d
+       rorl    $14,%r13d
+       movl    %edx,%r15d
+       xorl    %ecx,%r13d
+       rorl    $9,%r14d
+       xorl    %r8d,%r15d
+       movl    %r12d,8(%rsp)
+       xorl    %r10d,%r14d
+       andl    %ecx,%r15d
+       rorl    $5,%r13d
+       addl    %r9d,%r12d
+       xorl    %r8d,%r15d
+       rorl    $11,%r14d
+       xorl    %ecx,%r13d
+       addl    %r15d,%r12d
+       movl    %r10d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r10d,%r14d
+       xorl    %r11d,%r15d
+       rorl    $6,%r13d
+       movl    %r11d,%r9d
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %edi,%r9d
+       addl    %r12d,%ebx
+       addl    %r12d,%r9d
+       leaq    4(%rbp),%rbp
+       movl    16(%rsp),%r13d
+       movl    4(%rsp),%edi
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r9d
+       movl    %edi,%r14d
+       rorl    $2,%edi
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    48(%rsp),%r12d
+       addl    12(%rsp),%r12d
+       movl    %ebx,%r13d
+       addl    %edi,%r12d
+       movl    %r9d,%r14d
+       rorl    $14,%r13d
+       movl    %ecx,%edi
+       xorl    %ebx,%r13d
+       rorl    $9,%r14d
+       xorl    %edx,%edi
+       movl    %r12d,12(%rsp)
+       xorl    %r9d,%r14d
+       andl    %ebx,%edi
+       rorl    $5,%r13d
+       addl    %r8d,%r12d
+       xorl    %edx,%edi
+       rorl    $11,%r14d
+       xorl    %ebx,%r13d
+       addl    %edi,%r12d
+       movl    %r9d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r9d,%r14d
+       xorl    %r10d,%edi
+       rorl    $6,%r13d
+       movl    %r10d,%r8d
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %r15d,%r8d
+       addl    %r12d,%eax
+       addl    %r12d,%r8d
+       leaq    20(%rbp),%rbp
+       movl    20(%rsp),%r13d
+       movl    8(%rsp),%r15d
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r8d
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    52(%rsp),%r12d
+       addl    16(%rsp),%r12d
+       movl    %eax,%r13d
+       addl    %r15d,%r12d
+       movl    %r8d,%r14d
+       rorl    $14,%r13d
+       movl    %ebx,%r15d
+       xorl    %eax,%r13d
+       rorl    $9,%r14d
+       xorl    %ecx,%r15d
+       movl    %r12d,16(%rsp)
+       xorl    %r8d,%r14d
+       andl    %eax,%r15d
+       rorl    $5,%r13d
+       addl    %edx,%r12d
+       xorl    %ecx,%r15d
+       rorl    $11,%r14d
+       xorl    %eax,%r13d
+       addl    %r15d,%r12d
+       movl    %r8d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r8d,%r14d
+       xorl    %r9d,%r15d
+       rorl    $6,%r13d
+       movl    %r9d,%edx
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %edi,%edx
+       addl    %r12d,%r11d
+       addl    %r12d,%edx
+       leaq    4(%rbp),%rbp
+       movl    24(%rsp),%r13d
+       movl    12(%rsp),%edi
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%edx
+       movl    %edi,%r14d
+       rorl    $2,%edi
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    56(%rsp),%r12d
+       addl    20(%rsp),%r12d
+       movl    %r11d,%r13d
+       addl    %edi,%r12d
+       movl    %edx,%r14d
+       rorl    $14,%r13d
+       movl    %eax,%edi
+       xorl    %r11d,%r13d
+       rorl    $9,%r14d
+       xorl    %ebx,%edi
+       movl    %r12d,20(%rsp)
+       xorl    %edx,%r14d
+       andl    %r11d,%edi
+       rorl    $5,%r13d
+       addl    %ecx,%r12d
+       xorl    %ebx,%edi
+       rorl    $11,%r14d
+       xorl    %r11d,%r13d
+       addl    %edi,%r12d
+       movl    %edx,%edi
+       addl    (%rbp),%r12d
+       xorl    %edx,%r14d
+       xorl    %r8d,%edi
+       rorl    $6,%r13d
+       movl    %r8d,%ecx
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %r15d,%ecx
+       addl    %r12d,%r10d
+       addl    %r12d,%ecx
+       leaq    4(%rbp),%rbp
+       movl    28(%rsp),%r13d
+       movl    16(%rsp),%r15d
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%ecx
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    60(%rsp),%r12d
+       addl    24(%rsp),%r12d
+       movl    %r10d,%r13d
+       addl    %r15d,%r12d
+       movl    %ecx,%r14d
+       rorl    $14,%r13d
+       movl    %r11d,%r15d
+       xorl    %r10d,%r13d
+       rorl    $9,%r14d
+       xorl    %eax,%r15d
+       movl    %r12d,24(%rsp)
+       xorl    %ecx,%r14d
+       andl    %r10d,%r15d
+       rorl    $5,%r13d
+       addl    %ebx,%r12d
+       xorl    %eax,%r15d
+       rorl    $11,%r14d
+       xorl    %r10d,%r13d
+       addl    %r15d,%r12d
+       movl    %ecx,%r15d
+       addl    (%rbp),%r12d
+       xorl    %ecx,%r14d
+       xorl    %edx,%r15d
+       rorl    $6,%r13d
+       movl    %edx,%ebx
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %edi,%ebx
+       addl    %r12d,%r9d
+       addl    %r12d,%ebx
+       leaq    4(%rbp),%rbp
+       movl    32(%rsp),%r13d
+       movl    20(%rsp),%edi
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%ebx
+       movl    %edi,%r14d
+       rorl    $2,%edi
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    0(%rsp),%r12d
+       addl    28(%rsp),%r12d
+       movl    %r9d,%r13d
+       addl    %edi,%r12d
+       movl    %ebx,%r14d
+       rorl    $14,%r13d
+       movl    %r10d,%edi
+       xorl    %r9d,%r13d
+       rorl    $9,%r14d
+       xorl    %r11d,%edi
+       movl    %r12d,28(%rsp)
+       xorl    %ebx,%r14d
+       andl    %r9d,%edi
+       rorl    $5,%r13d
+       addl    %eax,%r12d
+       xorl    %r11d,%edi
+       rorl    $11,%r14d
+       xorl    %r9d,%r13d
+       addl    %edi,%r12d
+       movl    %ebx,%edi
+       addl    (%rbp),%r12d
+       xorl    %ebx,%r14d
+       xorl    %ecx,%edi
+       rorl    $6,%r13d
+       movl    %ecx,%eax
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %r15d,%eax
+       addl    %r12d,%r8d
+       addl    %r12d,%eax
+       leaq    20(%rbp),%rbp
+       movl    36(%rsp),%r13d
+       movl    24(%rsp),%r15d
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%eax
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    4(%rsp),%r12d
+       addl    32(%rsp),%r12d
+       movl    %r8d,%r13d
+       addl    %r15d,%r12d
+       movl    %eax,%r14d
+       rorl    $14,%r13d
+       movl    %r9d,%r15d
+       xorl    %r8d,%r13d
+       rorl    $9,%r14d
+       xorl    %r10d,%r15d
+       movl    %r12d,32(%rsp)
+       xorl    %eax,%r14d
+       andl    %r8d,%r15d
+       rorl    $5,%r13d
+       addl    %r11d,%r12d
+       xorl    %r10d,%r15d
+       rorl    $11,%r14d
+       xorl    %r8d,%r13d
+       addl    %r15d,%r12d
+       movl    %eax,%r15d
+       addl    (%rbp),%r12d
+       xorl    %eax,%r14d
+       xorl    %ebx,%r15d
+       rorl    $6,%r13d
+       movl    %ebx,%r11d
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %edi,%r11d
+       addl    %r12d,%edx
+       addl    %r12d,%r11d
+       leaq    4(%rbp),%rbp
+       movl    40(%rsp),%r13d
+       movl    28(%rsp),%edi
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r11d
+       movl    %edi,%r14d
+       rorl    $2,%edi
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    8(%rsp),%r12d
+       addl    36(%rsp),%r12d
+       movl    %edx,%r13d
+       addl    %edi,%r12d
+       movl    %r11d,%r14d
+       rorl    $14,%r13d
+       movl    %r8d,%edi
+       xorl    %edx,%r13d
+       rorl    $9,%r14d
+       xorl    %r9d,%edi
+       movl    %r12d,36(%rsp)
+       xorl    %r11d,%r14d
+       andl    %edx,%edi
+       rorl    $5,%r13d
+       addl    %r10d,%r12d
+       xorl    %r9d,%edi
+       rorl    $11,%r14d
+       xorl    %edx,%r13d
+       addl    %edi,%r12d
+       movl    %r11d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r11d,%r14d
+       xorl    %eax,%edi
+       rorl    $6,%r13d
+       movl    %eax,%r10d
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %r15d,%r10d
+       addl    %r12d,%ecx
+       addl    %r12d,%r10d
+       leaq    4(%rbp),%rbp
+       movl    44(%rsp),%r13d
+       movl    32(%rsp),%r15d
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r10d
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    12(%rsp),%r12d
+       addl    40(%rsp),%r12d
+       movl    %ecx,%r13d
+       addl    %r15d,%r12d
+       movl    %r10d,%r14d
+       rorl    $14,%r13d
+       movl    %edx,%r15d
+       xorl    %ecx,%r13d
+       rorl    $9,%r14d
+       xorl    %r8d,%r15d
+       movl    %r12d,40(%rsp)
+       xorl    %r10d,%r14d
+       andl    %ecx,%r15d
+       rorl    $5,%r13d
+       addl    %r9d,%r12d
+       xorl    %r8d,%r15d
+       rorl    $11,%r14d
+       xorl    %ecx,%r13d
+       addl    %r15d,%r12d
+       movl    %r10d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r10d,%r14d
+       xorl    %r11d,%r15d
+       rorl    $6,%r13d
+       movl    %r11d,%r9d
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %edi,%r9d
+       addl    %r12d,%ebx
+       addl    %r12d,%r9d
+       leaq    4(%rbp),%rbp
+       movl    48(%rsp),%r13d
+       movl    36(%rsp),%edi
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r9d
+       movl    %edi,%r14d
+       rorl    $2,%edi
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    16(%rsp),%r12d
+       addl    44(%rsp),%r12d
+       movl    %ebx,%r13d
+       addl    %edi,%r12d
+       movl    %r9d,%r14d
+       rorl    $14,%r13d
+       movl    %ecx,%edi
+       xorl    %ebx,%r13d
+       rorl    $9,%r14d
+       xorl    %edx,%edi
+       movl    %r12d,44(%rsp)
+       xorl    %r9d,%r14d
+       andl    %ebx,%edi
+       rorl    $5,%r13d
+       addl    %r8d,%r12d
+       xorl    %edx,%edi
+       rorl    $11,%r14d
+       xorl    %ebx,%r13d
+       addl    %edi,%r12d
+       movl    %r9d,%edi
+       addl    (%rbp),%r12d
+       xorl    %r9d,%r14d
+       xorl    %r10d,%edi
+       rorl    $6,%r13d
+       movl    %r10d,%r8d
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %r15d,%r8d
+       addl    %r12d,%eax
+       addl    %r12d,%r8d
+       leaq    20(%rbp),%rbp
+       movl    52(%rsp),%r13d
+       movl    40(%rsp),%r15d
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%r8d
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    20(%rsp),%r12d
+       addl    48(%rsp),%r12d
+       movl    %eax,%r13d
+       addl    %r15d,%r12d
+       movl    %r8d,%r14d
+       rorl    $14,%r13d
+       movl    %ebx,%r15d
+       xorl    %eax,%r13d
+       rorl    $9,%r14d
+       xorl    %ecx,%r15d
+       movl    %r12d,48(%rsp)
+       xorl    %r8d,%r14d
+       andl    %eax,%r15d
+       rorl    $5,%r13d
+       addl    %edx,%r12d
+       xorl    %ecx,%r15d
+       rorl    $11,%r14d
+       xorl    %eax,%r13d
+       addl    %r15d,%r12d
+       movl    %r8d,%r15d
+       addl    (%rbp),%r12d
+       xorl    %r8d,%r14d
+       xorl    %r9d,%r15d
+       rorl    $6,%r13d
+       movl    %r9d,%edx
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %edi,%edx
+       addl    %r12d,%r11d
+       addl    %r12d,%edx
+       leaq    4(%rbp),%rbp
+       movl    56(%rsp),%r13d
+       movl    44(%rsp),%edi
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%edx
+       movl    %edi,%r14d
+       rorl    $2,%edi
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    24(%rsp),%r12d
+       addl    52(%rsp),%r12d
+       movl    %r11d,%r13d
+       addl    %edi,%r12d
+       movl    %edx,%r14d
+       rorl    $14,%r13d
+       movl    %eax,%edi
+       xorl    %r11d,%r13d
+       rorl    $9,%r14d
+       xorl    %ebx,%edi
+       movl    %r12d,52(%rsp)
+       xorl    %edx,%r14d
+       andl    %r11d,%edi
+       rorl    $5,%r13d
+       addl    %ecx,%r12d
+       xorl    %ebx,%edi
+       rorl    $11,%r14d
+       xorl    %r11d,%r13d
+       addl    %edi,%r12d
+       movl    %edx,%edi
+       addl    (%rbp),%r12d
+       xorl    %edx,%r14d
+       xorl    %r8d,%edi
+       rorl    $6,%r13d
+       movl    %r8d,%ecx
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %r15d,%ecx
+       addl    %r12d,%r10d
+       addl    %r12d,%ecx
+       leaq    4(%rbp),%rbp
+       movl    60(%rsp),%r13d
+       movl    48(%rsp),%r15d
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%ecx
+       movl    %r15d,%r14d
+       rorl    $2,%r15d
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%r15d
+       shrl    $10,%r14d
+       rorl    $17,%r15d
+       xorl    %r13d,%r12d
+       xorl    %r14d,%r15d
+       addl    28(%rsp),%r12d
+       addl    56(%rsp),%r12d
+       movl    %r10d,%r13d
+       addl    %r15d,%r12d
+       movl    %ecx,%r14d
+       rorl    $14,%r13d
+       movl    %r11d,%r15d
+       xorl    %r10d,%r13d
+       rorl    $9,%r14d
+       xorl    %eax,%r15d
+       movl    %r12d,56(%rsp)
+       xorl    %ecx,%r14d
+       andl    %r10d,%r15d
+       rorl    $5,%r13d
+       addl    %ebx,%r12d
+       xorl    %eax,%r15d
+       rorl    $11,%r14d
+       xorl    %r10d,%r13d
+       addl    %r15d,%r12d
+       movl    %ecx,%r15d
+       addl    (%rbp),%r12d
+       xorl    %ecx,%r14d
+       xorl    %edx,%r15d
+       rorl    $6,%r13d
+       movl    %edx,%ebx
+       andl    %r15d,%edi
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %edi,%ebx
+       addl    %r12d,%r9d
+       addl    %r12d,%ebx
+       leaq    4(%rbp),%rbp
+       movl    0(%rsp),%r13d
+       movl    52(%rsp),%edi
+       movl    %r13d,%r12d
+       rorl    $11,%r13d
+       addl    %r14d,%ebx
+       movl    %edi,%r14d
+       rorl    $2,%edi
+       xorl    %r12d,%r13d
+       shrl    $3,%r12d
+       rorl    $7,%r13d
+       xorl    %r14d,%edi
+       shrl    $10,%r14d
+       rorl    $17,%edi
+       xorl    %r13d,%r12d
+       xorl    %r14d,%edi
+       addl    32(%rsp),%r12d
+       addl    60(%rsp),%r12d
+       movl    %r9d,%r13d
+       addl    %edi,%r12d
+       movl    %ebx,%r14d
+       rorl    $14,%r13d
+       movl    %r10d,%edi
+       xorl    %r9d,%r13d
+       rorl    $9,%r14d
+       xorl    %r11d,%edi
+       movl    %r12d,60(%rsp)
+       xorl    %ebx,%r14d
+       andl    %r9d,%edi
+       rorl    $5,%r13d
+       addl    %eax,%r12d
+       xorl    %r11d,%edi
+       rorl    $11,%r14d
+       xorl    %r9d,%r13d
+       addl    %edi,%r12d
+       movl    %ebx,%edi
+       addl    (%rbp),%r12d
+       xorl    %ebx,%r14d
+       xorl    %ecx,%edi
+       rorl    $6,%r13d
+       movl    %ecx,%eax
+       andl    %edi,%r15d
+       rorl    $2,%r14d
+       addl    %r13d,%r12d
+       xorl    %r15d,%eax
+       addl    %r12d,%r8d
+       addl    %r12d,%eax
+       leaq    20(%rbp),%rbp
+       cmpb    $0,3(%rbp)
+       jnz     .Lrounds_16_xx
+       movq    64+0(%rsp),%rdi
+       addl    %r14d,%eax
+       leaq    64(%rsi),%rsi
+       addl    0(%rdi),%eax
+       addl    4(%rdi),%ebx
+       addl    8(%rdi),%ecx
+       addl    12(%rdi),%edx
+       addl    16(%rdi),%r8d
+       addl    20(%rdi),%r9d
+       addl    24(%rdi),%r10d
+       addl    28(%rdi),%r11d
+       cmpq    64+16(%rsp),%rsi
+       movl    %eax,0(%rdi)
+       movl    %ebx,4(%rdi)
+       movl    %ecx,8(%rdi)
+       movl    %edx,12(%rdi)
+       movl    %r8d,16(%rdi)
+       movl    %r9d,20(%rdi)
+       movl    %r10d,24(%rdi)
+       movl    %r11d,28(%rdi)
+       jb      .Lloop
+       movq    88(%rsp),%rsi
+.cfi_def_cfa   %rsi,8
+       movq    -48(%rsi),%r15
+.cfi_restore   %r15
+       movq    -40(%rsi),%r14
+.cfi_restore   %r14
+       movq    -32(%rsi),%r13
+.cfi_restore   %r13
+       movq    -24(%rsi),%r12
+.cfi_restore   %r12
+       movq    -16(%rsi),%rbp
+.cfi_restore   %rbp
+       movq    -8(%rsi),%rbx
+.cfi_restore   %rbx
+       leaq    (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
+.Lepilogue:
+       RET
+.cfi_endproc
+SET_SIZE(zfs_sha256_transform_x64)
+
+ENTRY_ALIGN(zfs_sha256_transform_shani, 64)
+.cfi_startproc
+       ENDBR
+       leaq    K256+128(%rip),%rcx
+       movdqu  (%rdi),%xmm1
+       movdqu  16(%rdi),%xmm2
+       movdqa  512-128(%rcx),%xmm7
+
+       pshufd  $0x1b,%xmm1,%xmm0
+       pshufd  $0xb1,%xmm1,%xmm1
+       pshufd  $0x1b,%xmm2,%xmm2
+       movdqa  %xmm7,%xmm8
+.byte  102,15,58,15,202,8
+       punpcklqdq      %xmm0,%xmm2
+       jmp     .Loop_shani
+
+.align 16
+.Loop_shani:
+       movdqu  (%rsi),%xmm3
+       movdqu  16(%rsi),%xmm4
+       movdqu  32(%rsi),%xmm5
+.byte  102,15,56,0,223
+       movdqu  48(%rsi),%xmm6
+
+       movdqa  0-128(%rcx),%xmm0
+       paddd   %xmm3,%xmm0
+.byte  102,15,56,0,231
+       movdqa  %xmm2,%xmm10
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       nop
+       movdqa  %xmm1,%xmm9
+.byte  15,56,203,202
+
+       movdqa  32-128(%rcx),%xmm0
+       paddd   %xmm4,%xmm0
+.byte  102,15,56,0,239
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       leaq    64(%rsi),%rsi
+.byte  15,56,204,220
+.byte  15,56,203,202
+
+       movdqa  64-128(%rcx),%xmm0
+       paddd   %xmm5,%xmm0
+.byte  102,15,56,0,247
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm6,%xmm7
+.byte  102,15,58,15,253,4
+       nop
+       paddd   %xmm7,%xmm3
+.byte  15,56,204,229
+.byte  15,56,203,202
+
+       movdqa  96-128(%rcx),%xmm0
+       paddd   %xmm6,%xmm0
+.byte  15,56,205,222
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm3,%xmm7
+.byte  102,15,58,15,254,4
+       nop
+       paddd   %xmm7,%xmm4
+.byte  15,56,204,238
+.byte  15,56,203,202
+       movdqa  128-128(%rcx),%xmm0
+       paddd   %xmm3,%xmm0
+.byte  15,56,205,227
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm4,%xmm7
+.byte  102,15,58,15,251,4
+       nop
+       paddd   %xmm7,%xmm5
+.byte  15,56,204,243
+.byte  15,56,203,202
+       movdqa  160-128(%rcx),%xmm0
+       paddd   %xmm4,%xmm0
+.byte  15,56,205,236
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm5,%xmm7
+.byte  102,15,58,15,252,4
+       nop
+       paddd   %xmm7,%xmm6
+.byte  15,56,204,220
+.byte  15,56,203,202
+       movdqa  192-128(%rcx),%xmm0
+       paddd   %xmm5,%xmm0
+.byte  15,56,205,245
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm6,%xmm7
+.byte  102,15,58,15,253,4
+       nop
+       paddd   %xmm7,%xmm3
+.byte  15,56,204,229
+.byte  15,56,203,202
+       movdqa  224-128(%rcx),%xmm0
+       paddd   %xmm6,%xmm0
+.byte  15,56,205,222
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm3,%xmm7
+.byte  102,15,58,15,254,4
+       nop
+       paddd   %xmm7,%xmm4
+.byte  15,56,204,238
+.byte  15,56,203,202
+       movdqa  256-128(%rcx),%xmm0
+       paddd   %xmm3,%xmm0
+.byte  15,56,205,227
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm4,%xmm7
+.byte  102,15,58,15,251,4
+       nop
+       paddd   %xmm7,%xmm5
+.byte  15,56,204,243
+.byte  15,56,203,202
+       movdqa  288-128(%rcx),%xmm0
+       paddd   %xmm4,%xmm0
+.byte  15,56,205,236
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm5,%xmm7
+.byte  102,15,58,15,252,4
+       nop
+       paddd   %xmm7,%xmm6
+.byte  15,56,204,220
+.byte  15,56,203,202
+       movdqa  320-128(%rcx),%xmm0
+       paddd   %xmm5,%xmm0
+.byte  15,56,205,245
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm6,%xmm7
+.byte  102,15,58,15,253,4
+       nop
+       paddd   %xmm7,%xmm3
+.byte  15,56,204,229
+.byte  15,56,203,202
+       movdqa  352-128(%rcx),%xmm0
+       paddd   %xmm6,%xmm0
+.byte  15,56,205,222
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm3,%xmm7
+.byte  102,15,58,15,254,4
+       nop
+       paddd   %xmm7,%xmm4
+.byte  15,56,204,238
+.byte  15,56,203,202
+       movdqa  384-128(%rcx),%xmm0
+       paddd   %xmm3,%xmm0
+.byte  15,56,205,227
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm4,%xmm7
+.byte  102,15,58,15,251,4
+       nop
+       paddd   %xmm7,%xmm5
+.byte  15,56,204,243
+.byte  15,56,203,202
+       movdqa  416-128(%rcx),%xmm0
+       paddd   %xmm4,%xmm0
+.byte  15,56,205,236
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       movdqa  %xmm5,%xmm7
+.byte  102,15,58,15,252,4
+.byte  15,56,203,202
+       paddd   %xmm7,%xmm6
+
+       movdqa  448-128(%rcx),%xmm0
+       paddd   %xmm5,%xmm0
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+.byte  15,56,205,245
+       movdqa  %xmm8,%xmm7
+.byte  15,56,203,202
+
+       movdqa  480-128(%rcx),%xmm0
+       paddd   %xmm6,%xmm0
+       nop
+.byte  15,56,203,209
+       pshufd  $0x0e,%xmm0,%xmm0
+       decq    %rdx
+       nop
+.byte  15,56,203,202
+
+       paddd   %xmm10,%xmm2
+       paddd   %xmm9,%xmm1
+       jnz     .Loop_shani
+
+       pshufd  $0xb1,%xmm2,%xmm2
+       pshufd  $0x1b,%xmm1,%xmm7
+       pshufd  $0xb1,%xmm1,%xmm1
+       punpckhqdq      %xmm2,%xmm1
+.byte  102,15,58,15,215,8
+
+       movdqu  %xmm1,(%rdi)
+       movdqu  %xmm2,16(%rdi)
+       RET
+.cfi_endproc
+SET_SIZE(zfs_sha256_transform_shani)
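The ".byte" sequences in zfs_sha256_transform_shani above are raw opcode encodings kept from the OpenSSL output: 102,15,56,0,... is pshufb, 102,15,58,15,... is palignr, and 15,56,203/204/205,... are the SHA-NI instructions sha256rnds2, sha256msg1 and sha256msg2, emitted as bytes so the file still assembles with toolchains whose assemblers lack those mnemonics. As an illustration only (not part of this commit), the four-round pattern the routine repeats maps roughly onto the compiler intrinsics as below; the function and variable names are local to this sketch, which assumes a compiler invoked with -mssse3 -msha.

	#include <immintrin.h>

	/*
	 * abef/cdgh hold the hash state in the lane order the SHA extensions
	 * expect; wk holds four message words already added to their round
	 * constants (the movdqa/paddd pairs against K256 above).
	 */
	static inline void
	sha256ni_four_rounds(__m128i *abef, __m128i *cdgh, __m128i wk)
	{
		/* Rounds i and i+1 consume the two low dwords of wk ... */
		*cdgh = _mm_sha256rnds2_epu32(*cdgh, *abef, wk);
		/* ... the high dwords are then moved down (pshufd $0x0e) ... */
		wk = _mm_shuffle_epi32(wk, 0x0e);
		/* ... and rounds i+2 and i+3 consume them. */
		*abef = _mm_sha256rnds2_epu32(*abef, *cdgh, wk);
	}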
+
+ENTRY_ALIGN(zfs_sha256_transform_ssse3, 64)
+.cfi_startproc
+       ENDBR
+       movq    %rsp,%rax
+.cfi_def_cfa_register  %rax
+       pushq   %rbx
+.cfi_offset    %rbx,-16
+       pushq   %rbp
+.cfi_offset    %rbp,-24
+       pushq   %r12
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_offset    %r14,-48
+       pushq   %r15
+.cfi_offset    %r15,-56
+       shlq    $4,%rdx
+       subq    $96,%rsp
+       leaq    (%rsi,%rdx,4),%rdx
+       andq    $-64,%rsp
+       movq    %rdi,64+0(%rsp)
+       movq    %rsi,64+8(%rsp)
+       movq    %rdx,64+16(%rsp)
+       movq    %rax,88(%rsp)
+.cfi_escape    0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue_ssse3:
+
+       movl    0(%rdi),%eax
+       movl    4(%rdi),%ebx
+       movl    8(%rdi),%ecx
+       movl    12(%rdi),%edx
+       movl    16(%rdi),%r8d
+       movl    20(%rdi),%r9d
+       movl    24(%rdi),%r10d
+       movl    28(%rdi),%r11d
+
+       jmp     .Lloop_ssse3
+.align 16
+.Lloop_ssse3:
+       movdqa  K256+512(%rip),%xmm7
+       movdqu  0(%rsi),%xmm0
+       movdqu  16(%rsi),%xmm1
+       movdqu  32(%rsi),%xmm2
+.byte  102,15,56,0,199
+       movdqu  48(%rsi),%xmm3
+       leaq    K256(%rip),%rbp
+.byte  102,15,56,0,207
+       movdqa  0(%rbp),%xmm4
+       movdqa  32(%rbp),%xmm5
+.byte  102,15,56,0,215
+       paddd   %xmm0,%xmm4
+       movdqa  64(%rbp),%xmm6
+.byte  102,15,56,0,223
+       movdqa  96(%rbp),%xmm7
+       paddd   %xmm1,%xmm5
+       paddd   %xmm2,%xmm6
+       paddd   %xmm3,%xmm7
+       movdqa  %xmm4,0(%rsp)
+       movl    %eax,%r14d
+       movdqa  %xmm5,16(%rsp)
+       movl    %ebx,%edi
+       movdqa  %xmm6,32(%rsp)
+       xorl    %ecx,%edi
+       movdqa  %xmm7,48(%rsp)
+       movl    %r8d,%r13d
+       jmp     .Lssse3_00_47
+
+.align 16
+.Lssse3_00_47:
+       subq    $-128,%rbp
+       rorl    $14,%r13d
+       movdqa  %xmm1,%xmm4
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       movdqa  %xmm3,%xmm7
+       rorl    $9,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       rorl    $5,%r13d
+       xorl    %eax,%r14d
+.byte  102,15,58,15,224,4
+       andl    %r8d,%r12d
+       xorl    %r8d,%r13d
+.byte  102,15,58,15,250,4
+       addl    0(%rsp),%r11d
+       movl    %eax,%r15d
+       xorl    %r10d,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm4,%xmm5
+       xorl    %ebx,%r15d
+       addl    %r12d,%r11d
+       movdqa  %xmm4,%xmm6
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       psrld   $3,%xmm4
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       paddd   %xmm7,%xmm0
+       rorl    $2,%r14d
+       addl    %r11d,%edx
+       psrld   $7,%xmm6
+       addl    %edi,%r11d
+       movl    %edx,%r13d
+       pshufd  $250,%xmm3,%xmm7
+       addl    %r11d,%r14d
+       rorl    $14,%r13d
+       pslld   $14,%xmm5
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       pxor    %xmm6,%xmm4
+       rorl    $9,%r14d
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       rorl    $5,%r13d
+       psrld   $11,%xmm6
+       xorl    %r11d,%r14d
+       pxor    %xmm5,%xmm4
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       pslld   $11,%xmm5
+       addl    4(%rsp),%r10d
+       movl    %r11d,%edi
+       pxor    %xmm6,%xmm4
+       xorl    %r9d,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm7,%xmm6
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       pxor    %xmm5,%xmm4
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       psrld   $10,%xmm7
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       paddd   %xmm4,%xmm0
+       rorl    $2,%r14d
+       addl    %r10d,%ecx
+       psrlq   $17,%xmm6
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       pxor    %xmm6,%xmm7
+       rorl    $14,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       rorl    $9,%r14d
+       psrlq   $2,%xmm6
+       xorl    %ecx,%r13d
+       xorl    %r8d,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $5,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       pshufd  $128,%xmm7,%xmm7
+       xorl    %ecx,%r13d
+       addl    8(%rsp),%r9d
+       movl    %r10d,%r15d
+       psrldq  $8,%xmm7
+       xorl    %r8d,%r12d
+       rorl    $11,%r14d
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       rorl    $6,%r13d
+       paddd   %xmm7,%xmm0
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       pshufd  $80,%xmm0,%xmm7
+       xorl    %r11d,%edi
+       rorl    $2,%r14d
+       addl    %r9d,%ebx
+       movdqa  %xmm7,%xmm6
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       psrld   $10,%xmm7
+       addl    %r9d,%r14d
+       rorl    $14,%r13d
+       psrlq   $17,%xmm6
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $9,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       rorl    $5,%r13d
+       xorl    %r9d,%r14d
+       psrlq   $2,%xmm6
+       andl    %ebx,%r12d
+       xorl    %ebx,%r13d
+       addl    12(%rsp),%r8d
+       pxor    %xmm6,%xmm7
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       rorl    $11,%r14d
+       pshufd  $8,%xmm7,%xmm7
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       movdqa  0(%rbp),%xmm6
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       pslldq  $8,%xmm7
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       paddd   %xmm7,%xmm0
+       rorl    $2,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       paddd   %xmm0,%xmm6
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       movdqa  %xmm6,0(%rsp)
+       rorl    $14,%r13d
+       movdqa  %xmm2,%xmm4
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       movdqa  %xmm0,%xmm7
+       rorl    $9,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       rorl    $5,%r13d
+       xorl    %r8d,%r14d
+.byte  102,15,58,15,225,4
+       andl    %eax,%r12d
+       xorl    %eax,%r13d
+.byte  102,15,58,15,251,4
+       addl    16(%rsp),%edx
+       movl    %r8d,%r15d
+       xorl    %ecx,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm4,%xmm5
+       xorl    %r9d,%r15d
+       addl    %r12d,%edx
+       movdqa  %xmm4,%xmm6
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       psrld   $3,%xmm4
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       paddd   %xmm7,%xmm1
+       rorl    $2,%r14d
+       addl    %edx,%r11d
+       psrld   $7,%xmm6
+       addl    %edi,%edx
+       movl    %r11d,%r13d
+       pshufd  $250,%xmm0,%xmm7
+       addl    %edx,%r14d
+       rorl    $14,%r13d
+       pslld   $14,%xmm5
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       pxor    %xmm6,%xmm4
+       rorl    $9,%r14d
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       rorl    $5,%r13d
+       psrld   $11,%xmm6
+       xorl    %edx,%r14d
+       pxor    %xmm5,%xmm4
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       pslld   $11,%xmm5
+       addl    20(%rsp),%ecx
+       movl    %edx,%edi
+       pxor    %xmm6,%xmm4
+       xorl    %ebx,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm7,%xmm6
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       pxor    %xmm5,%xmm4
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       psrld   $10,%xmm7
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       paddd   %xmm4,%xmm1
+       rorl    $2,%r14d
+       addl    %ecx,%r10d
+       psrlq   $17,%xmm6
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       pxor    %xmm6,%xmm7
+       rorl    $14,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       rorl    $9,%r14d
+       psrlq   $2,%xmm6
+       xorl    %r10d,%r13d
+       xorl    %eax,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $5,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       pshufd  $128,%xmm7,%xmm7
+       xorl    %r10d,%r13d
+       addl    24(%rsp),%ebx
+       movl    %ecx,%r15d
+       psrldq  $8,%xmm7
+       xorl    %eax,%r12d
+       rorl    $11,%r14d
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       rorl    $6,%r13d
+       paddd   %xmm7,%xmm1
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       pshufd  $80,%xmm1,%xmm7
+       xorl    %edx,%edi
+       rorl    $2,%r14d
+       addl    %ebx,%r9d
+       movdqa  %xmm7,%xmm6
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       psrld   $10,%xmm7
+       addl    %ebx,%r14d
+       rorl    $14,%r13d
+       psrlq   $17,%xmm6
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $9,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       rorl    $5,%r13d
+       xorl    %ebx,%r14d
+       psrlq   $2,%xmm6
+       andl    %r9d,%r12d
+       xorl    %r9d,%r13d
+       addl    28(%rsp),%eax
+       pxor    %xmm6,%xmm7
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       rorl    $11,%r14d
+       pshufd  $8,%xmm7,%xmm7
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       movdqa  32(%rbp),%xmm6
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       pslldq  $8,%xmm7
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       paddd   %xmm7,%xmm1
+       rorl    $2,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       paddd   %xmm1,%xmm6
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       movdqa  %xmm6,16(%rsp)
+       rorl    $14,%r13d
+       movdqa  %xmm3,%xmm4
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       movdqa  %xmm1,%xmm7
+       rorl    $9,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       rorl    $5,%r13d
+       xorl    %eax,%r14d
+.byte  102,15,58,15,226,4
+       andl    %r8d,%r12d
+       xorl    %r8d,%r13d
+.byte  102,15,58,15,248,4
+       addl    32(%rsp),%r11d
+       movl    %eax,%r15d
+       xorl    %r10d,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm4,%xmm5
+       xorl    %ebx,%r15d
+       addl    %r12d,%r11d
+       movdqa  %xmm4,%xmm6
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       psrld   $3,%xmm4
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       paddd   %xmm7,%xmm2
+       rorl    $2,%r14d
+       addl    %r11d,%edx
+       psrld   $7,%xmm6
+       addl    %edi,%r11d
+       movl    %edx,%r13d
+       pshufd  $250,%xmm1,%xmm7
+       addl    %r11d,%r14d
+       rorl    $14,%r13d
+       pslld   $14,%xmm5
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       pxor    %xmm6,%xmm4
+       rorl    $9,%r14d
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       rorl    $5,%r13d
+       psrld   $11,%xmm6
+       xorl    %r11d,%r14d
+       pxor    %xmm5,%xmm4
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       pslld   $11,%xmm5
+       addl    36(%rsp),%r10d
+       movl    %r11d,%edi
+       pxor    %xmm6,%xmm4
+       xorl    %r9d,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm7,%xmm6
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       pxor    %xmm5,%xmm4
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       psrld   $10,%xmm7
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       paddd   %xmm4,%xmm2
+       rorl    $2,%r14d
+       addl    %r10d,%ecx
+       psrlq   $17,%xmm6
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       pxor    %xmm6,%xmm7
+       rorl    $14,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       rorl    $9,%r14d
+       psrlq   $2,%xmm6
+       xorl    %ecx,%r13d
+       xorl    %r8d,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $5,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       pshufd  $128,%xmm7,%xmm7
+       xorl    %ecx,%r13d
+       addl    40(%rsp),%r9d
+       movl    %r10d,%r15d
+       psrldq  $8,%xmm7
+       xorl    %r8d,%r12d
+       rorl    $11,%r14d
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       rorl    $6,%r13d
+       paddd   %xmm7,%xmm2
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       pshufd  $80,%xmm2,%xmm7
+       xorl    %r11d,%edi
+       rorl    $2,%r14d
+       addl    %r9d,%ebx
+       movdqa  %xmm7,%xmm6
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       psrld   $10,%xmm7
+       addl    %r9d,%r14d
+       rorl    $14,%r13d
+       psrlq   $17,%xmm6
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $9,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       rorl    $5,%r13d
+       xorl    %r9d,%r14d
+       psrlq   $2,%xmm6
+       andl    %ebx,%r12d
+       xorl    %ebx,%r13d
+       addl    44(%rsp),%r8d
+       pxor    %xmm6,%xmm7
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       rorl    $11,%r14d
+       pshufd  $8,%xmm7,%xmm7
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       movdqa  64(%rbp),%xmm6
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       pslldq  $8,%xmm7
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       paddd   %xmm7,%xmm2
+       rorl    $2,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       paddd   %xmm2,%xmm6
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       movdqa  %xmm6,32(%rsp)
+       rorl    $14,%r13d
+       movdqa  %xmm0,%xmm4
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       movdqa  %xmm2,%xmm7
+       rorl    $9,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       rorl    $5,%r13d
+       xorl    %r8d,%r14d
+.byte  102,15,58,15,227,4
+       andl    %eax,%r12d
+       xorl    %eax,%r13d
+.byte  102,15,58,15,249,4
+       addl    48(%rsp),%edx
+       movl    %r8d,%r15d
+       xorl    %ecx,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm4,%xmm5
+       xorl    %r9d,%r15d
+       addl    %r12d,%edx
+       movdqa  %xmm4,%xmm6
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       psrld   $3,%xmm4
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       paddd   %xmm7,%xmm3
+       rorl    $2,%r14d
+       addl    %edx,%r11d
+       psrld   $7,%xmm6
+       addl    %edi,%edx
+       movl    %r11d,%r13d
+       pshufd  $250,%xmm2,%xmm7
+       addl    %edx,%r14d
+       rorl    $14,%r13d
+       pslld   $14,%xmm5
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       pxor    %xmm6,%xmm4
+       rorl    $9,%r14d
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       rorl    $5,%r13d
+       psrld   $11,%xmm6
+       xorl    %edx,%r14d
+       pxor    %xmm5,%xmm4
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       pslld   $11,%xmm5
+       addl    52(%rsp),%ecx
+       movl    %edx,%edi
+       pxor    %xmm6,%xmm4
+       xorl    %ebx,%r12d
+       rorl    $11,%r14d
+       movdqa  %xmm7,%xmm6
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       pxor    %xmm5,%xmm4
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       psrld   $10,%xmm7
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       paddd   %xmm4,%xmm3
+       rorl    $2,%r14d
+       addl    %ecx,%r10d
+       psrlq   $17,%xmm6
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       pxor    %xmm6,%xmm7
+       rorl    $14,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       rorl    $9,%r14d
+       psrlq   $2,%xmm6
+       xorl    %r10d,%r13d
+       xorl    %eax,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $5,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       pshufd  $128,%xmm7,%xmm7
+       xorl    %r10d,%r13d
+       addl    56(%rsp),%ebx
+       movl    %ecx,%r15d
+       psrldq  $8,%xmm7
+       xorl    %eax,%r12d
+       rorl    $11,%r14d
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       rorl    $6,%r13d
+       paddd   %xmm7,%xmm3
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       pshufd  $80,%xmm3,%xmm7
+       xorl    %edx,%edi
+       rorl    $2,%r14d
+       addl    %ebx,%r9d
+       movdqa  %xmm7,%xmm6
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       psrld   $10,%xmm7
+       addl    %ebx,%r14d
+       rorl    $14,%r13d
+       psrlq   $17,%xmm6
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       pxor    %xmm6,%xmm7
+       rorl    $9,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       rorl    $5,%r13d
+       xorl    %ebx,%r14d
+       psrlq   $2,%xmm6
+       andl    %r9d,%r12d
+       xorl    %r9d,%r13d
+       addl    60(%rsp),%eax
+       pxor    %xmm6,%xmm7
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       rorl    $11,%r14d
+       pshufd  $8,%xmm7,%xmm7
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       movdqa  96(%rbp),%xmm6
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       pslldq  $8,%xmm7
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       paddd   %xmm7,%xmm3
+       rorl    $2,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       paddd   %xmm3,%xmm6
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       movdqa  %xmm6,48(%rsp)
+       cmpb    $0,131(%rbp)
+       jne     .Lssse3_00_47
+       rorl    $14,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       rorl    $9,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       rorl    $5,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       xorl    %r8d,%r13d
+       addl    0(%rsp),%r11d
+       movl    %eax,%r15d
+       xorl    %r10d,%r12d
+       rorl    $11,%r14d
+       xorl    %ebx,%r15d
+       addl    %r12d,%r11d
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       rorl    $2,%r14d
+       addl    %r11d,%edx
+       addl    %edi,%r11d
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       rorl    $9,%r14d
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       rorl    $5,%r13d
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       addl    4(%rsp),%r10d
+       movl    %r11d,%edi
+       xorl    %r9d,%r12d
+       rorl    $11,%r14d
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       rorl    $2,%r14d
+       addl    %r10d,%ecx
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       rorl    $9,%r14d
+       xorl    %ecx,%r13d
+       xorl    %r8d,%r12d
+       rorl    $5,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       xorl    %ecx,%r13d
+       addl    8(%rsp),%r9d
+       movl    %r10d,%r15d
+       xorl    %r8d,%r12d
+       rorl    $11,%r14d
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       xorl    %r11d,%edi
+       rorl    $2,%r14d
+       addl    %r9d,%ebx
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       rorl    $9,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       rorl    $5,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       xorl    %ebx,%r13d
+       addl    12(%rsp),%r8d
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       rorl    $11,%r14d
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       rorl    $2,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       rorl    $9,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       rorl    $5,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       xorl    %eax,%r13d
+       addl    16(%rsp),%edx
+       movl    %r8d,%r15d
+       xorl    %ecx,%r12d
+       rorl    $11,%r14d
+       xorl    %r9d,%r15d
+       addl    %r12d,%edx
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       rorl    $2,%r14d
+       addl    %edx,%r11d
+       addl    %edi,%edx
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       rorl    $9,%r14d
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       rorl    $5,%r13d
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       addl    20(%rsp),%ecx
+       movl    %edx,%edi
+       xorl    %ebx,%r12d
+       rorl    $11,%r14d
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       rorl    $2,%r14d
+       addl    %ecx,%r10d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       rorl    $9,%r14d
+       xorl    %r10d,%r13d
+       xorl    %eax,%r12d
+       rorl    $5,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       xorl    %r10d,%r13d
+       addl    24(%rsp),%ebx
+       movl    %ecx,%r15d
+       xorl    %eax,%r12d
+       rorl    $11,%r14d
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       xorl    %edx,%edi
+       rorl    $2,%r14d
+       addl    %ebx,%r9d
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       rorl    $9,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       rorl    $5,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       xorl    %r9d,%r13d
+       addl    28(%rsp),%eax
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       rorl    $11,%r14d
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       rorl    $2,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       rorl    $9,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       rorl    $5,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       xorl    %r8d,%r13d
+       addl    32(%rsp),%r11d
+       movl    %eax,%r15d
+       xorl    %r10d,%r12d
+       rorl    $11,%r14d
+       xorl    %ebx,%r15d
+       addl    %r12d,%r11d
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       rorl    $2,%r14d
+       addl    %r11d,%edx
+       addl    %edi,%r11d
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       rorl    $9,%r14d
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       rorl    $5,%r13d
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       addl    36(%rsp),%r10d
+       movl    %r11d,%edi
+       xorl    %r9d,%r12d
+       rorl    $11,%r14d
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       rorl    $2,%r14d
+       addl    %r10d,%ecx
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       rorl    $9,%r14d
+       xorl    %ecx,%r13d
+       xorl    %r8d,%r12d
+       rorl    $5,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       xorl    %ecx,%r13d
+       addl    40(%rsp),%r9d
+       movl    %r10d,%r15d
+       xorl    %r8d,%r12d
+       rorl    $11,%r14d
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       xorl    %r11d,%edi
+       rorl    $2,%r14d
+       addl    %r9d,%ebx
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       rorl    $9,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       rorl    $5,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       xorl    %ebx,%r13d
+       addl    44(%rsp),%r8d
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       rorl    $11,%r14d
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       rorl    $2,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       rorl    $9,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       rorl    $5,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       xorl    %eax,%r13d
+       addl    48(%rsp),%edx
+       movl    %r8d,%r15d
+       xorl    %ecx,%r12d
+       rorl    $11,%r14d
+       xorl    %r9d,%r15d
+       addl    %r12d,%edx
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       rorl    $2,%r14d
+       addl    %edx,%r11d
+       addl    %edi,%edx
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       rorl    $9,%r14d
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       rorl    $5,%r13d
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       addl    52(%rsp),%ecx
+       movl    %edx,%edi
+       xorl    %ebx,%r12d
+       rorl    $11,%r14d
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       rorl    $2,%r14d
+       addl    %ecx,%r10d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       rorl    $9,%r14d
+       xorl    %r10d,%r13d
+       xorl    %eax,%r12d
+       rorl    $5,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       xorl    %r10d,%r13d
+       addl    56(%rsp),%ebx
+       movl    %ecx,%r15d
+       xorl    %eax,%r12d
+       rorl    $11,%r14d
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       rorl    $6,%r13d
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       xorl    %edx,%edi
+       rorl    $2,%r14d
+       addl    %ebx,%r9d
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       rorl    $14,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       rorl    $9,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       rorl    $5,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       xorl    %r9d,%r13d
+       addl    60(%rsp),%eax
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       rorl    $11,%r14d
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       rorl    $6,%r13d
+       andl    %edi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       rorl    $2,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       movq    64+0(%rsp),%rdi
+       movl    %r14d,%eax
+
+       addl    0(%rdi),%eax
+       leaq    64(%rsi),%rsi
+       addl    4(%rdi),%ebx
+       addl    8(%rdi),%ecx
+       addl    12(%rdi),%edx
+       addl    16(%rdi),%r8d
+       addl    20(%rdi),%r9d
+       addl    24(%rdi),%r10d
+       addl    28(%rdi),%r11d
+
+       cmpq    64+16(%rsp),%rsi
+
+       movl    %eax,0(%rdi)
+       movl    %ebx,4(%rdi)
+       movl    %ecx,8(%rdi)
+       movl    %edx,12(%rdi)
+       movl    %r8d,16(%rdi)
+       movl    %r9d,20(%rdi)
+       movl    %r10d,24(%rdi)
+       movl    %r11d,28(%rdi)
+       jb      .Lloop_ssse3
+
+       movq    88(%rsp),%rsi
+.cfi_def_cfa   %rsi,8
+       movq    -48(%rsi),%r15
+.cfi_restore   %r15
+       movq    -40(%rsi),%r14
+.cfi_restore   %r14
+       movq    -32(%rsi),%r13
+.cfi_restore   %r13
+       movq    -24(%rsi),%r12
+.cfi_restore   %r12
+       movq    -16(%rsi),%rbp
+.cfi_restore   %rbp
+       movq    -8(%rsi),%rbx
+.cfi_restore   %rbx
+       leaq    (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
+.Lepilogue_ssse3:
+       RET
+.cfi_endproc
+SET_SIZE(zfs_sha256_transform_ssse3)
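The SSSE3 routine above (and the AVX routine that follows) interleaves the scalar rounds with a vectorized message schedule: the shift/rotate/xor groups (psrld/pslld/pxor/psrlq, or vpsrld/vpslld/vpxor/vpsrlq in the AVX path) compute four expanded message words at a time and store them, already added to their round constants, in the 64-byte scratch area at (%rsp) that the rounds then read. As a reading aid only (not part of this commit), the recurrence being vectorized is the standard FIPS 180-4 schedule; the names below are local to this sketch.

	#include <stdint.h>

	#define	ROTR32(x, n)	(((x) >> (n)) | ((x) << (32 - (n))))
	#define	SSIG0(x)	(ROTR32(x, 7) ^ ROTR32(x, 18) ^ ((x) >> 3))
	#define	SSIG1(x)	(ROTR32(x, 17) ^ ROTR32(x, 19) ^ ((x) >> 10))

	/* Expand the 16 block words w[0..15] into the full 64-word schedule. */
	static void
	sha256_schedule(uint32_t w[64])
	{
		for (int t = 16; t < 64; t++)
			w[t] = SSIG1(w[t - 2]) + w[t - 7] +
			    SSIG0(w[t - 15]) + w[t - 16];
	}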
+
+ENTRY_ALIGN(zfs_sha256_transform_avx, 64)
+.cfi_startproc
+       ENDBR
+       movq    %rsp,%rax
+.cfi_def_cfa_register  %rax
+       pushq   %rbx
+.cfi_offset    %rbx,-16
+       pushq   %rbp
+.cfi_offset    %rbp,-24
+       pushq   %r12
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_offset    %r14,-48
+       pushq   %r15
+.cfi_offset    %r15,-56
+       shlq    $4,%rdx
+       subq    $96,%rsp
+       leaq    (%rsi,%rdx,4),%rdx
+       andq    $-64,%rsp
+       movq    %rdi,64+0(%rsp)
+       movq    %rsi,64+8(%rsp)
+       movq    %rdx,64+16(%rsp)
+       movq    %rax,88(%rsp)
+.cfi_escape    0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue_avx:
+
+       vzeroupper
+       movl    0(%rdi),%eax
+       movl    4(%rdi),%ebx
+       movl    8(%rdi),%ecx
+       movl    12(%rdi),%edx
+       movl    16(%rdi),%r8d
+       movl    20(%rdi),%r9d
+       movl    24(%rdi),%r10d
+       movl    28(%rdi),%r11d
+       vmovdqa K256+512+32(%rip),%xmm8
+       vmovdqa K256+512+64(%rip),%xmm9
+       jmp     .Lloop_avx
+.align 16
+.Lloop_avx:
+       vmovdqa K256+512(%rip),%xmm7
+       vmovdqu 0(%rsi),%xmm0
+       vmovdqu 16(%rsi),%xmm1
+       vmovdqu 32(%rsi),%xmm2
+       vmovdqu 48(%rsi),%xmm3
+       vpshufb %xmm7,%xmm0,%xmm0
+       leaq    K256(%rip),%rbp
+       vpshufb %xmm7,%xmm1,%xmm1
+       vpshufb %xmm7,%xmm2,%xmm2
+       vpaddd  0(%rbp),%xmm0,%xmm4
+       vpshufb %xmm7,%xmm3,%xmm3
+       vpaddd  32(%rbp),%xmm1,%xmm5
+       vpaddd  64(%rbp),%xmm2,%xmm6
+       vpaddd  96(%rbp),%xmm3,%xmm7
+       vmovdqa %xmm4,0(%rsp)
+       movl    %eax,%r14d
+       vmovdqa %xmm5,16(%rsp)
+       movl    %ebx,%edi
+       vmovdqa %xmm6,32(%rsp)
+       xorl    %ecx,%edi
+       vmovdqa %xmm7,48(%rsp)
+       movl    %r8d,%r13d
+       jmp     .Lavx_00_47
+
+.align 16
+.Lavx_00_47:
+       subq    $-128,%rbp
+       vpalignr        $4,%xmm0,%xmm1,%xmm4
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       vpalignr        $4,%xmm2,%xmm3,%xmm7
+       shrdl   $9,%r14d,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       vpsrld  $7,%xmm4,%xmm6
+       shrdl   $5,%r13d,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       vpaddd  %xmm7,%xmm0,%xmm0
+       xorl    %r8d,%r13d
+       addl    0(%rsp),%r11d
+       movl    %eax,%r15d
+       vpsrld  $3,%xmm4,%xmm7
+       xorl    %r10d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ebx,%r15d
+       vpslld  $14,%xmm4,%xmm5
+       addl    %r12d,%r11d
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       vpxor   %xmm6,%xmm7,%xmm4
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       vpshufd $250,%xmm3,%xmm7
+       shrdl   $2,%r14d,%r14d
+       addl    %r11d,%edx
+       addl    %edi,%r11d
+       vpsrld  $11,%xmm6,%xmm6
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       shrdl   $14,%r13d,%r13d
+       vpxor   %xmm5,%xmm4,%xmm4
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       shrdl   $9,%r14d,%r14d
+       vpslld  $11,%xmm5,%xmm5
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       shrdl   $5,%r13d,%r13d
+       vpxor   %xmm6,%xmm4,%xmm4
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       vpsrld  $10,%xmm7,%xmm6
+       addl    4(%rsp),%r10d
+       movl    %r11d,%edi
+       xorl    %r9d,%r12d
+       vpxor   %xmm5,%xmm4,%xmm4
+       shrdl   $11,%r14d,%r14d
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       vpsrlq  $17,%xmm7,%xmm7
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       vpaddd  %xmm4,%xmm0,%xmm0
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       shrdl   $2,%r14d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       addl    %r10d,%ecx
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %r10d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r10d
+       vpxor   %xmm7,%xmm6,%xmm6
+       movl    %edx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ecx,%r13d
+       vpshufb %xmm8,%xmm6,%xmm6
+       xorl    %r8d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r10d,%r14d
+       vpaddd  %xmm6,%xmm0,%xmm0
+       andl    %ecx,%r12d
+       xorl    %ecx,%r13d
+       addl    8(%rsp),%r9d
+       vpshufd $80,%xmm0,%xmm7
+       movl    %r10d,%r15d
+       xorl    %r8d,%r12d
+       shrdl   $11,%r14d,%r14d
+       vpsrld  $10,%xmm7,%xmm6
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       shrdl   $6,%r13d,%r13d
+       vpsrlq  $17,%xmm7,%xmm7
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       vpxor   %xmm7,%xmm6,%xmm6
+       xorl    %r11d,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %r9d,%ebx
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       vpshufb %xmm9,%xmm6,%xmm6
+       shrdl   $9,%r14d,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       vpaddd  %xmm6,%xmm0,%xmm0
+       shrdl   $5,%r13d,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       vpaddd  0(%rbp),%xmm0,%xmm6
+       xorl    %ebx,%r13d
+       addl    12(%rsp),%r8d
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       vmovdqa %xmm6,0(%rsp)
+       vpalignr        $4,%xmm1,%xmm2,%xmm4
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       vpalignr        $4,%xmm3,%xmm0,%xmm7
+       shrdl   $9,%r14d,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       vpsrld  $7,%xmm4,%xmm6
+       shrdl   $5,%r13d,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       vpaddd  %xmm7,%xmm1,%xmm1
+       xorl    %eax,%r13d
+       addl    16(%rsp),%edx
+       movl    %r8d,%r15d
+       vpsrld  $3,%xmm4,%xmm7
+       xorl    %ecx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r9d,%r15d
+       vpslld  $14,%xmm4,%xmm5
+       addl    %r12d,%edx
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       vpxor   %xmm6,%xmm7,%xmm4
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       vpshufd $250,%xmm0,%xmm7
+       shrdl   $2,%r14d,%r14d
+       addl    %edx,%r11d
+       addl    %edi,%edx
+       vpsrld  $11,%xmm6,%xmm6
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       shrdl   $14,%r13d,%r13d
+       vpxor   %xmm5,%xmm4,%xmm4
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       shrdl   $9,%r14d,%r14d
+       vpslld  $11,%xmm5,%xmm5
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       shrdl   $5,%r13d,%r13d
+       vpxor   %xmm6,%xmm4,%xmm4
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       vpsrld  $10,%xmm7,%xmm6
+       addl    20(%rsp),%ecx
+       movl    %edx,%edi
+       xorl    %ebx,%r12d
+       vpxor   %xmm5,%xmm4,%xmm4
+       shrdl   $11,%r14d,%r14d
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       vpsrlq  $17,%xmm7,%xmm7
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       vpaddd  %xmm4,%xmm1,%xmm1
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       shrdl   $2,%r14d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       addl    %ecx,%r10d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %ecx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ecx
+       vpxor   %xmm7,%xmm6,%xmm6
+       movl    %r11d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r10d,%r13d
+       vpshufb %xmm8,%xmm6,%xmm6
+       xorl    %eax,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ecx,%r14d
+       vpaddd  %xmm6,%xmm1,%xmm1
+       andl    %r10d,%r12d
+       xorl    %r10d,%r13d
+       addl    24(%rsp),%ebx
+       vpshufd $80,%xmm1,%xmm7
+       movl    %ecx,%r15d
+       xorl    %eax,%r12d
+       shrdl   $11,%r14d,%r14d
+       vpsrld  $10,%xmm7,%xmm6
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       shrdl   $6,%r13d,%r13d
+       vpsrlq  $17,%xmm7,%xmm7
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       vpxor   %xmm7,%xmm6,%xmm6
+       xorl    %edx,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %ebx,%r9d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       vpshufb %xmm9,%xmm6,%xmm6
+       shrdl   $9,%r14d,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       vpaddd  %xmm6,%xmm1,%xmm1
+       shrdl   $5,%r13d,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       vpaddd  32(%rbp),%xmm1,%xmm6
+       xorl    %r9d,%r13d
+       addl    28(%rsp),%eax
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       vmovdqa %xmm6,16(%rsp)
+       vpalignr        $4,%xmm2,%xmm3,%xmm4
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       vpalignr        $4,%xmm0,%xmm1,%xmm7
+       shrdl   $9,%r14d,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       vpsrld  $7,%xmm4,%xmm6
+       shrdl   $5,%r13d,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       vpaddd  %xmm7,%xmm2,%xmm2
+       xorl    %r8d,%r13d
+       addl    32(%rsp),%r11d
+       movl    %eax,%r15d
+       vpsrld  $3,%xmm4,%xmm7
+       xorl    %r10d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ebx,%r15d
+       vpslld  $14,%xmm4,%xmm5
+       addl    %r12d,%r11d
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       vpxor   %xmm6,%xmm7,%xmm4
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       vpshufd $250,%xmm1,%xmm7
+       shrdl   $2,%r14d,%r14d
+       addl    %r11d,%edx
+       addl    %edi,%r11d
+       vpsrld  $11,%xmm6,%xmm6
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       shrdl   $14,%r13d,%r13d
+       vpxor   %xmm5,%xmm4,%xmm4
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       shrdl   $9,%r14d,%r14d
+       vpslld  $11,%xmm5,%xmm5
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       shrdl   $5,%r13d,%r13d
+       vpxor   %xmm6,%xmm4,%xmm4
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       vpsrld  $10,%xmm7,%xmm6
+       addl    36(%rsp),%r10d
+       movl    %r11d,%edi
+       xorl    %r9d,%r12d
+       vpxor   %xmm5,%xmm4,%xmm4
+       shrdl   $11,%r14d,%r14d
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       vpsrlq  $17,%xmm7,%xmm7
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       vpaddd  %xmm4,%xmm2,%xmm2
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       shrdl   $2,%r14d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       addl    %r10d,%ecx
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %r10d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r10d
+       vpxor   %xmm7,%xmm6,%xmm6
+       movl    %edx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ecx,%r13d
+       vpshufb %xmm8,%xmm6,%xmm6
+       xorl    %r8d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r10d,%r14d
+       vpaddd  %xmm6,%xmm2,%xmm2
+       andl    %ecx,%r12d
+       xorl    %ecx,%r13d
+       addl    40(%rsp),%r9d
+       vpshufd $80,%xmm2,%xmm7
+       movl    %r10d,%r15d
+       xorl    %r8d,%r12d
+       shrdl   $11,%r14d,%r14d
+       vpsrld  $10,%xmm7,%xmm6
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       shrdl   $6,%r13d,%r13d
+       vpsrlq  $17,%xmm7,%xmm7
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       vpxor   %xmm7,%xmm6,%xmm6
+       xorl    %r11d,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %r9d,%ebx
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       vpshufb %xmm9,%xmm6,%xmm6
+       shrdl   $9,%r14d,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       vpaddd  %xmm6,%xmm2,%xmm2
+       shrdl   $5,%r13d,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       vpaddd  64(%rbp),%xmm2,%xmm6
+       xorl    %ebx,%r13d
+       addl    44(%rsp),%r8d
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       vmovdqa %xmm6,32(%rsp)
+       vpalignr        $4,%xmm3,%xmm0,%xmm4
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       vpalignr        $4,%xmm1,%xmm2,%xmm7
+       shrdl   $9,%r14d,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       vpsrld  $7,%xmm4,%xmm6
+       shrdl   $5,%r13d,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       vpaddd  %xmm7,%xmm3,%xmm3
+       xorl    %eax,%r13d
+       addl    48(%rsp),%edx
+       movl    %r8d,%r15d
+       vpsrld  $3,%xmm4,%xmm7
+       xorl    %ecx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r9d,%r15d
+       vpslld  $14,%xmm4,%xmm5
+       addl    %r12d,%edx
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       vpxor   %xmm6,%xmm7,%xmm4
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       vpshufd $250,%xmm2,%xmm7
+       shrdl   $2,%r14d,%r14d
+       addl    %edx,%r11d
+       addl    %edi,%edx
+       vpsrld  $11,%xmm6,%xmm6
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       shrdl   $14,%r13d,%r13d
+       vpxor   %xmm5,%xmm4,%xmm4
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       shrdl   $9,%r14d,%r14d
+       vpslld  $11,%xmm5,%xmm5
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       shrdl   $5,%r13d,%r13d
+       vpxor   %xmm6,%xmm4,%xmm4
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       vpsrld  $10,%xmm7,%xmm6
+       addl    52(%rsp),%ecx
+       movl    %edx,%edi
+       xorl    %ebx,%r12d
+       vpxor   %xmm5,%xmm4,%xmm4
+       shrdl   $11,%r14d,%r14d
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       vpsrlq  $17,%xmm7,%xmm7
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       vpaddd  %xmm4,%xmm3,%xmm3
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       shrdl   $2,%r14d,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       addl    %ecx,%r10d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %ecx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ecx
+       vpxor   %xmm7,%xmm6,%xmm6
+       movl    %r11d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r10d,%r13d
+       vpshufb %xmm8,%xmm6,%xmm6
+       xorl    %eax,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ecx,%r14d
+       vpaddd  %xmm6,%xmm3,%xmm3
+       andl    %r10d,%r12d
+       xorl    %r10d,%r13d
+       addl    56(%rsp),%ebx
+       vpshufd $80,%xmm3,%xmm7
+       movl    %ecx,%r15d
+       xorl    %eax,%r12d
+       shrdl   $11,%r14d,%r14d
+       vpsrld  $10,%xmm7,%xmm6
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       shrdl   $6,%r13d,%r13d
+       vpsrlq  $17,%xmm7,%xmm7
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       vpxor   %xmm7,%xmm6,%xmm6
+       xorl    %edx,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %ebx,%r9d
+       vpsrlq  $2,%xmm7,%xmm7
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       vpxor   %xmm7,%xmm6,%xmm6
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       vpshufb %xmm9,%xmm6,%xmm6
+       shrdl   $9,%r14d,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       vpaddd  %xmm6,%xmm3,%xmm3
+       shrdl   $5,%r13d,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       vpaddd  96(%rbp),%xmm3,%xmm6
+       xorl    %r9d,%r13d
+       addl    60(%rsp),%eax
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       vmovdqa %xmm6,48(%rsp)
+       cmpb    $0,131(%rbp)
+       jne     .Lavx_00_47
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       xorl    %r8d,%r13d
+       addl    0(%rsp),%r11d
+       movl    %eax,%r15d
+       xorl    %r10d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ebx,%r15d
+       addl    %r12d,%r11d
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %r11d,%edx
+       addl    %edi,%r11d
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       addl    4(%rsp),%r10d
+       movl    %r11d,%edi
+       xorl    %r9d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %r10d,%ecx
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ecx,%r13d
+       xorl    %r8d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       xorl    %ecx,%r13d
+       addl    8(%rsp),%r9d
+       movl    %r10d,%r15d
+       xorl    %r8d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       xorl    %r11d,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %r9d,%ebx
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       xorl    %ebx,%r13d
+       addl    12(%rsp),%r8d
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       xorl    %eax,%r13d
+       addl    16(%rsp),%edx
+       movl    %r8d,%r15d
+       xorl    %ecx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r9d,%r15d
+       addl    %r12d,%edx
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %edx,%r11d
+       addl    %edi,%edx
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       addl    20(%rsp),%ecx
+       movl    %edx,%edi
+       xorl    %ebx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %ecx,%r10d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r10d,%r13d
+       xorl    %eax,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       xorl    %r10d,%r13d
+       addl    24(%rsp),%ebx
+       movl    %ecx,%r15d
+       xorl    %eax,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       xorl    %edx,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %ebx,%r9d
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       xorl    %r9d,%r13d
+       addl    28(%rsp),%eax
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%eax
+       movl    %r9d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r8d,%r13d
+       xorl    %r10d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %eax,%r14d
+       andl    %r8d,%r12d
+       xorl    %r8d,%r13d
+       addl    32(%rsp),%r11d
+       movl    %eax,%r15d
+       xorl    %r10d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ebx,%r15d
+       addl    %r12d,%r11d
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %eax,%r14d
+       addl    %r13d,%r11d
+       xorl    %ebx,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %r11d,%edx
+       addl    %edi,%r11d
+       movl    %edx,%r13d
+       addl    %r11d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r11d
+       movl    %r8d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %edx,%r13d
+       xorl    %r9d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r11d,%r14d
+       andl    %edx,%r12d
+       xorl    %edx,%r13d
+       addl    36(%rsp),%r10d
+       movl    %r11d,%edi
+       xorl    %r9d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %eax,%edi
+       addl    %r12d,%r10d
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r11d,%r14d
+       addl    %r13d,%r10d
+       xorl    %eax,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %r10d,%ecx
+       addl    %r15d,%r10d
+       movl    %ecx,%r13d
+       addl    %r10d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r10d
+       movl    %edx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ecx,%r13d
+       xorl    %r8d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r10d,%r14d
+       andl    %ecx,%r12d
+       xorl    %ecx,%r13d
+       addl    40(%rsp),%r9d
+       movl    %r10d,%r15d
+       xorl    %r8d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r11d,%r15d
+       addl    %r12d,%r9d
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %r10d,%r14d
+       addl    %r13d,%r9d
+       xorl    %r11d,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %r9d,%ebx
+       addl    %edi,%r9d
+       movl    %ebx,%r13d
+       addl    %r9d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r9d
+       movl    %ecx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %ebx,%r13d
+       xorl    %edx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r9d,%r14d
+       andl    %ebx,%r12d
+       xorl    %ebx,%r13d
+       addl    44(%rsp),%r8d
+       movl    %r9d,%edi
+       xorl    %edx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r10d,%edi
+       addl    %r12d,%r8d
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %r9d,%r14d
+       addl    %r13d,%r8d
+       xorl    %r10d,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %r8d,%eax
+       addl    %r15d,%r8d
+       movl    %eax,%r13d
+       addl    %r8d,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%r8d
+       movl    %ebx,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %eax,%r13d
+       xorl    %ecx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %r8d,%r14d
+       andl    %eax,%r12d
+       xorl    %eax,%r13d
+       addl    48(%rsp),%edx
+       movl    %r8d,%r15d
+       xorl    %ecx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r9d,%r15d
+       addl    %r12d,%edx
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %r8d,%r14d
+       addl    %r13d,%edx
+       xorl    %r9d,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %edx,%r11d
+       addl    %edi,%edx
+       movl    %r11d,%r13d
+       addl    %edx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%edx
+       movl    %eax,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r11d,%r13d
+       xorl    %ebx,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %edx,%r14d
+       andl    %r11d,%r12d
+       xorl    %r11d,%r13d
+       addl    52(%rsp),%ecx
+       movl    %edx,%edi
+       xorl    %ebx,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %r8d,%edi
+       addl    %r12d,%ecx
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %edx,%r14d
+       addl    %r13d,%ecx
+       xorl    %r8d,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %ecx,%r10d
+       addl    %r15d,%ecx
+       movl    %r10d,%r13d
+       addl    %ecx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ecx
+       movl    %r11d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r10d,%r13d
+       xorl    %eax,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ecx,%r14d
+       andl    %r10d,%r12d
+       xorl    %r10d,%r13d
+       addl    56(%rsp),%ebx
+       movl    %ecx,%r15d
+       xorl    %eax,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %edx,%r15d
+       addl    %r12d,%ebx
+       shrdl   $6,%r13d,%r13d
+       andl    %r15d,%edi
+       xorl    %ecx,%r14d
+       addl    %r13d,%ebx
+       xorl    %edx,%edi
+       shrdl   $2,%r14d,%r14d
+       addl    %ebx,%r9d
+       addl    %edi,%ebx
+       movl    %r9d,%r13d
+       addl    %ebx,%r14d
+       shrdl   $14,%r13d,%r13d
+       movl    %r14d,%ebx
+       movl    %r10d,%r12d
+       shrdl   $9,%r14d,%r14d
+       xorl    %r9d,%r13d
+       xorl    %r11d,%r12d
+       shrdl   $5,%r13d,%r13d
+       xorl    %ebx,%r14d
+       andl    %r9d,%r12d
+       xorl    %r9d,%r13d
+       addl    60(%rsp),%eax
+       movl    %ebx,%edi
+       xorl    %r11d,%r12d
+       shrdl   $11,%r14d,%r14d
+       xorl    %ecx,%edi
+       addl    %r12d,%eax
+       shrdl   $6,%r13d,%r13d
+       andl    %edi,%r15d
+       xorl    %ebx,%r14d
+       addl    %r13d,%eax
+       xorl    %ecx,%r15d
+       shrdl   $2,%r14d,%r14d
+       addl    %eax,%r8d
+       addl    %r15d,%eax
+       movl    %r8d,%r13d
+       addl    %eax,%r14d
+       movq    64+0(%rsp),%rdi
+       movl    %r14d,%eax
+
+       addl    0(%rdi),%eax
+       leaq    64(%rsi),%rsi
+       addl    4(%rdi),%ebx
+       addl    8(%rdi),%ecx
+       addl    12(%rdi),%edx
+       addl    16(%rdi),%r8d
+       addl    20(%rdi),%r9d
+       addl    24(%rdi),%r10d
+       addl    28(%rdi),%r11d
+
+       cmpq    64+16(%rsp),%rsi
+
+       movl    %eax,0(%rdi)
+       movl    %ebx,4(%rdi)
+       movl    %ecx,8(%rdi)
+       movl    %edx,12(%rdi)
+       movl    %r8d,16(%rdi)
+       movl    %r9d,20(%rdi)
+       movl    %r10d,24(%rdi)
+       movl    %r11d,28(%rdi)
+       jb      .Lloop_avx
+
+       movq    88(%rsp),%rsi
+.cfi_def_cfa   %rsi,8
+       vzeroupper
+       movq    -48(%rsi),%r15
+.cfi_restore   %r15
+       movq    -40(%rsi),%r14
+.cfi_restore   %r14
+       movq    -32(%rsi),%r13
+.cfi_restore   %r13
+       movq    -24(%rsi),%r12
+.cfi_restore   %r12
+       movq    -16(%rsi),%rbp
+.cfi_restore   %rbp
+       movq    -8(%rsi),%rbx
+.cfi_restore   %rbx
+       leaq    (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
+.Lepilogue_avx:
+       RET
+.cfi_endproc
+SET_SIZE(zfs_sha256_transform_avx)
+
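+/*
+ * Calling convention, as read off the prologue below (the C prototype is an
+ * illustrative sketch, not a declaration taken from this commit):
+ *   zfs_sha256_transform_avx2(uint32_t state[8], const void *data, size_t blks)
+ *     %rdi: SHA-256 state, eight 32-bit words a..h (loaded from 0..28(%rdi))
+ *     %rsi: input message blocks
+ *     %rdx: number of 64-byte blocks; the prologue computes the end pointer
+ *           as %rsi + %rdx*64, and the AVX2 loop consumes two blocks per
+ *           iteration (note the "leaq 128(%rsi),%rsi" before .Ldone_avx2).
+ */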
+ENTRY_ALIGN(zfs_sha256_transform_avx2, 64)
+.cfi_startproc
+       ENDBR
+       movq    %rsp,%rax
+.cfi_def_cfa_register  %rax
+       pushq   %rbx
+.cfi_offset    %rbx,-16
+       pushq   %rbp
+.cfi_offset    %rbp,-24
+       pushq   %r12
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_offset    %r14,-48
+       pushq   %r15
+.cfi_offset    %r15,-56
+       subq    $544,%rsp
+       shlq    $4,%rdx
+       andq    $-1024,%rsp
+       leaq    (%rsi,%rdx,4),%rdx
+       addq    $448,%rsp
+       movq    %rdi,64+0(%rsp)
+       movq    %rsi,64+8(%rsp)
+       movq    %rdx,64+16(%rsp)
+       movq    %rax,88(%rsp)
+.cfi_escape    0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+.Lprologue_avx2:
+
+       vzeroupper
+       subq    $-64,%rsi
+       movl    0(%rdi),%eax
+       movq    %rsi,%r12
+       movl    4(%rdi),%ebx
+       cmpq    %rdx,%rsi
+       movl    8(%rdi),%ecx
+       cmoveq  %rsp,%r12
+       movl    12(%rdi),%edx
+       movl    16(%rdi),%r8d
+       movl    20(%rdi),%r9d
+       movl    24(%rdi),%r10d
+       movl    28(%rdi),%r11d
+       vmovdqa K256+512+32(%rip),%ymm8
+       vmovdqa K256+512+64(%rip),%ymm9
+       jmp     .Loop_avx2
+.align 16
+.Loop_avx2:
+       vmovdqa K256+512(%rip),%ymm7
+       vmovdqu -64+0(%rsi),%xmm0
+       vmovdqu -64+16(%rsi),%xmm1
+       vmovdqu -64+32(%rsi),%xmm2
+       vmovdqu -64+48(%rsi),%xmm3
+
+       vinserti128     $1,(%r12),%ymm0,%ymm0
+       vinserti128     $1,16(%r12),%ymm1,%ymm1
+       vpshufb %ymm7,%ymm0,%ymm0
+       vinserti128     $1,32(%r12),%ymm2,%ymm2
+       vpshufb %ymm7,%ymm1,%ymm1
+       vinserti128     $1,48(%r12),%ymm3,%ymm3
+
+       leaq    K256(%rip),%rbp
+       vpshufb %ymm7,%ymm2,%ymm2
+       vpaddd  0(%rbp),%ymm0,%ymm4
+       vpshufb %ymm7,%ymm3,%ymm3
+       vpaddd  32(%rbp),%ymm1,%ymm5
+       vpaddd  64(%rbp),%ymm2,%ymm6
+       vpaddd  96(%rbp),%ymm3,%ymm7
+       vmovdqa %ymm4,0(%rsp)
+       xorl    %r14d,%r14d
+       vmovdqa %ymm5,32(%rsp)
+
+       movq    88(%rsp),%rdi
+.cfi_def_cfa   %rdi,8
+       leaq    -64(%rsp),%rsp
+
+
+
+       movq    %rdi,-8(%rsp)
+.cfi_escape    0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+       movl    %ebx,%edi
+       vmovdqa %ymm6,0(%rsp)
+       xorl    %ecx,%edi
+       vmovdqa %ymm7,32(%rsp)
+       movl    %r9d,%r12d
+       subq    $-32*4,%rbp
+       jmp     .Lavx2_00_47
+
+.align 16
+.Lavx2_00_47:
+       leaq    -64(%rsp),%rsp
+.cfi_escape    0x0f,0x05,0x77,0x38,0x06,0x23,0x08
+
+       pushq   64-8(%rsp)
+.cfi_escape    0x0f,0x05,0x77,0x00,0x06,0x23,0x08
+       leaq    8(%rsp),%rsp
+.cfi_escape    0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+       vpalignr        $4,%ymm0,%ymm1,%ymm4
+       addl    0+128(%rsp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       vpalignr        $4,%ymm2,%ymm3,%ymm7
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       vpsrld  $7,%ymm4,%ymm6
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       vpaddd  %ymm7,%ymm0,%ymm0
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       vpsrld  $3,%ymm4,%ymm7
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       vpslld  $14,%ymm4,%ymm5
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       vpxor   %ymm6,%ymm7,%ymm4
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %ebx,%edi
+       vpshufd $250,%ymm3,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%r11,%rdi,1),%r11d
+       movl    %r8d,%r12d
+       vpsrld  $11,%ymm6,%ymm6
+       addl    4+128(%rsp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $11,%edx,%edi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       vpslld  $11,%ymm5,%ymm5
+       andnl   %r9d,%edx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%edx,%r14d
+       vpxor   %ymm6,%ymm4,%ymm4
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%edi
+       vpsrld  $10,%ymm7,%ymm6
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%edi
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       vpsrlq  $17,%ymm7,%ymm7
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       vpaddd  %ymm4,%ymm0,%ymm0
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    8+128(%rsp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       vpshufb %ymm8,%ymm6,%ymm6
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       vpaddd  %ymm6,%ymm0,%ymm0
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       vpshufd $80,%ymm0,%ymm7
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       vpsrld  $10,%ymm7,%ymm6
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r11d,%edi
+       vpsrlq  $17,%ymm7,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%r9,%rdi,1),%r9d
+       movl    %ecx,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    12+128(%rsp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%ebx,%edi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %edx,%ebx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%ebx,%r14d
+       vpshufb %ymm9,%ymm6,%ymm6
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%edi
+       vpaddd  %ymm6,%ymm0,%ymm0
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%edi
+       vpaddd  0(%rbp),%ymm0,%ymm6
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       vmovdqa %ymm6,0(%rsp)
+       vpalignr        $4,%ymm1,%ymm2,%ymm4
+       addl    32+128(%rsp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       vpalignr        $4,%ymm3,%ymm0,%ymm7
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       vpsrld  $7,%ymm4,%ymm6
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       vpaddd  %ymm7,%ymm1,%ymm1
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       vpsrld  $3,%ymm4,%ymm7
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       vpslld  $14,%ymm4,%ymm5
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       vpxor   %ymm6,%ymm7,%ymm4
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r9d,%edi
+       vpshufd $250,%ymm0,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rdi,1),%edx
+       movl    %eax,%r12d
+       vpsrld  $11,%ymm6,%ymm6
+       addl    36+128(%rsp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $11,%r11d,%edi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       vpslld  $11,%ymm5,%ymm5
+       andnl   %ebx,%r11d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r11d,%r14d
+       vpxor   %ymm6,%ymm4,%ymm4
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%edi
+       vpsrld  $10,%ymm7,%ymm6
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%edi
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       vpsrlq  $17,%ymm7,%ymm7
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       vpaddd  %ymm4,%ymm1,%ymm1
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    40+128(%rsp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       vpshufb %ymm8,%ymm6,%ymm6
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       vpaddd  %ymm6,%ymm1,%ymm1
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       vpshufd $80,%ymm1,%ymm7
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       vpsrld  $10,%ymm7,%ymm6
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %edx,%edi
+       vpsrlq  $17,%ymm7,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rdi,1),%ebx
+       movl    %r10d,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    44+128(%rsp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%r9d,%edi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %r11d,%r9d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r9d,%r14d
+       vpshufb %ymm9,%ymm6,%ymm6
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%edi
+       vpaddd  %ymm6,%ymm1,%ymm1
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%edi
+       vpaddd  32(%rbp),%ymm1,%ymm6
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       vmovdqa %ymm6,32(%rsp)
+       leaq    -64(%rsp),%rsp
+.cfi_escape    0x0f,0x05,0x77,0x38,0x06,0x23,0x08
+
+       pushq   64-8(%rsp)
+.cfi_escape    0x0f,0x05,0x77,0x00,0x06,0x23,0x08
+       leaq    8(%rsp),%rsp
+.cfi_escape    0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+       vpalignr        $4,%ymm2,%ymm3,%ymm4
+       addl    0+128(%rsp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       vpalignr        $4,%ymm0,%ymm1,%ymm7
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       vpsrld  $7,%ymm4,%ymm6
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       vpaddd  %ymm7,%ymm2,%ymm2
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       vpsrld  $3,%ymm4,%ymm7
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       vpslld  $14,%ymm4,%ymm5
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       vpxor   %ymm6,%ymm7,%ymm4
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %ebx,%edi
+       vpshufd $250,%ymm1,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%r11,%rdi,1),%r11d
+       movl    %r8d,%r12d
+       vpsrld  $11,%ymm6,%ymm6
+       addl    4+128(%rsp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $11,%edx,%edi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       vpslld  $11,%ymm5,%ymm5
+       andnl   %r9d,%edx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%edx,%r14d
+       vpxor   %ymm6,%ymm4,%ymm4
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%edi
+       vpsrld  $10,%ymm7,%ymm6
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%edi
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       vpsrlq  $17,%ymm7,%ymm7
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       vpaddd  %ymm4,%ymm2,%ymm2
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    8+128(%rsp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       vpshufb %ymm8,%ymm6,%ymm6
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       vpaddd  %ymm6,%ymm2,%ymm2
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       vpshufd $80,%ymm2,%ymm7
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       vpsrld  $10,%ymm7,%ymm6
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r11d,%edi
+       vpsrlq  $17,%ymm7,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%r9,%rdi,1),%r9d
+       movl    %ecx,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    12+128(%rsp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%ebx,%edi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %edx,%ebx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%ebx,%r14d
+       vpshufb %ymm9,%ymm6,%ymm6
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%edi
+       vpaddd  %ymm6,%ymm2,%ymm2
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%edi
+       vpaddd  64(%rbp),%ymm2,%ymm6
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       vmovdqa %ymm6,0(%rsp)
+       vpalignr        $4,%ymm3,%ymm0,%ymm4
+       addl    32+128(%rsp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       vpalignr        $4,%ymm1,%ymm2,%ymm7
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       vpsrld  $7,%ymm4,%ymm6
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       vpaddd  %ymm7,%ymm3,%ymm3
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       vpsrld  $3,%ymm4,%ymm7
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       vpslld  $14,%ymm4,%ymm5
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       vpxor   %ymm6,%ymm7,%ymm4
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r9d,%edi
+       vpshufd $250,%ymm2,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rdi,1),%edx
+       movl    %eax,%r12d
+       vpsrld  $11,%ymm6,%ymm6
+       addl    36+128(%rsp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $11,%r11d,%edi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       vpslld  $11,%ymm5,%ymm5
+       andnl   %ebx,%r11d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r11d,%r14d
+       vpxor   %ymm6,%ymm4,%ymm4
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%edi
+       vpsrld  $10,%ymm7,%ymm6
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%edi
+       vpxor   %ymm5,%ymm4,%ymm4
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       vpsrlq  $17,%ymm7,%ymm7
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       vpaddd  %ymm4,%ymm3,%ymm3
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    40+128(%rsp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       vpshufb %ymm8,%ymm6,%ymm6
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       vpaddd  %ymm6,%ymm3,%ymm3
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       vpshufd $80,%ymm3,%ymm7
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       vpsrld  $10,%ymm7,%ymm6
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %edx,%edi
+       vpsrlq  $17,%ymm7,%ymm7
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rdi,1),%ebx
+       movl    %r10d,%r12d
+       vpxor   %ymm7,%ymm6,%ymm6
+       addl    44+128(%rsp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       vpsrlq  $2,%ymm7,%ymm7
+       rorxl   $11,%r9d,%edi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       vpxor   %ymm7,%ymm6,%ymm6
+       andnl   %r11d,%r9d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r9d,%r14d
+       vpshufb %ymm9,%ymm6,%ymm6
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%edi
+       vpaddd  %ymm6,%ymm3,%ymm3
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%edi
+       vpaddd  96(%rbp),%ymm3,%ymm6
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       vmovdqa %ymm6,32(%rsp)
+       leaq    128(%rbp),%rbp
+       cmpb    $0,3(%rbp)
+       jne     .Lavx2_00_47
+       addl    0+64(%rsp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %ebx,%edi
+       xorl    %r13d,%r14d
+       leal    (%r11,%rdi,1),%r11d
+       movl    %r8d,%r12d
+       addl    4+64(%rsp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       rorxl   $11,%edx,%edi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       andnl   %r9d,%edx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%edx,%r14d
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%edi
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%edi
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       addl    8+64(%rsp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r11d,%edi
+       xorl    %r13d,%r14d
+       leal    (%r9,%rdi,1),%r9d
+       movl    %ecx,%r12d
+       addl    12+64(%rsp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       rorxl   $11,%ebx,%edi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       andnl   %edx,%ebx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%ebx,%r14d
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%edi
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%edi
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       addl    32+64(%rsp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r9d,%edi
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rdi,1),%edx
+       movl    %eax,%r12d
+       addl    36+64(%rsp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       rorxl   $11,%r11d,%edi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       andnl   %ebx,%r11d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r11d,%r14d
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%edi
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%edi
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       addl    40+64(%rsp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %edx,%edi
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rdi,1),%ebx
+       movl    %r10d,%r12d
+       addl    44+64(%rsp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       rorxl   $11,%r9d,%edi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       andnl   %r11d,%r9d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r9d,%r14d
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%edi
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%edi
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       addl    0(%rsp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %ebx,%edi
+       xorl    %r13d,%r14d
+       leal    (%r11,%rdi,1),%r11d
+       movl    %r8d,%r12d
+       addl    4(%rsp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       rorxl   $11,%edx,%edi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       andnl   %r9d,%edx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%edx,%r14d
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%edi
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%edi
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       addl    8(%rsp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r11d,%edi
+       xorl    %r13d,%r14d
+       leal    (%r9,%rdi,1),%r9d
+       movl    %ecx,%r12d
+       addl    12(%rsp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       rorxl   $11,%ebx,%edi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       andnl   %edx,%ebx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%ebx,%r14d
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%edi
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%edi
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       addl    32(%rsp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r9d,%edi
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rdi,1),%edx
+       movl    %eax,%r12d
+       addl    36(%rsp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       rorxl   $11,%r11d,%edi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       andnl   %ebx,%r11d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r11d,%r14d
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%edi
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%edi
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       addl    40(%rsp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %edx,%edi
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rdi,1),%ebx
+       movl    %r10d,%r12d
+       addl    44(%rsp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       rorxl   $11,%r9d,%edi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       andnl   %r11d,%r9d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r9d,%r14d
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%edi
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%edi
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       movq    512(%rsp),%rdi
+       addl    %r14d,%eax
+
+       leaq    448(%rsp),%rbp
+
+       addl    0(%rdi),%eax
+       addl    4(%rdi),%ebx
+       addl    8(%rdi),%ecx
+       addl    12(%rdi),%edx
+       addl    16(%rdi),%r8d
+       addl    20(%rdi),%r9d
+       addl    24(%rdi),%r10d
+       addl    28(%rdi),%r11d
+
+       movl    %eax,0(%rdi)
+       movl    %ebx,4(%rdi)
+       movl    %ecx,8(%rdi)
+       movl    %edx,12(%rdi)
+       movl    %r8d,16(%rdi)
+       movl    %r9d,20(%rdi)
+       movl    %r10d,24(%rdi)
+       movl    %r11d,28(%rdi)
+
+       cmpq    80(%rbp),%rsi
+       je      .Ldone_avx2
+
+       xorl    %r14d,%r14d
+       movl    %ebx,%edi
+       xorl    %ecx,%edi
+       movl    %r9d,%r12d
+       jmp     .Lower_avx2
+.align 16
+.Lower_avx2:
+       addl    0+16(%rbp),%r11d
+       andl    %r8d,%r12d
+       rorxl   $25,%r8d,%r13d
+       rorxl   $11,%r8d,%r15d
+       leal    (%rax,%r14,1),%eax
+       leal    (%r11,%r12,1),%r11d
+       andnl   %r10d,%r8d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r8d,%r14d
+       leal    (%r11,%r12,1),%r11d
+       xorl    %r14d,%r13d
+       movl    %eax,%r15d
+       rorxl   $22,%eax,%r12d
+       leal    (%r11,%r13,1),%r11d
+       xorl    %ebx,%r15d
+       rorxl   $13,%eax,%r14d
+       rorxl   $2,%eax,%r13d
+       leal    (%rdx,%r11,1),%edx
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %ebx,%edi
+       xorl    %r13d,%r14d
+       leal    (%r11,%rdi,1),%r11d
+       movl    %r8d,%r12d
+       addl    4+16(%rbp),%r10d
+       andl    %edx,%r12d
+       rorxl   $25,%edx,%r13d
+       rorxl   $11,%edx,%edi
+       leal    (%r11,%r14,1),%r11d
+       leal    (%r10,%r12,1),%r10d
+       andnl   %r9d,%edx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%edx,%r14d
+       leal    (%r10,%r12,1),%r10d
+       xorl    %r14d,%r13d
+       movl    %r11d,%edi
+       rorxl   $22,%r11d,%r12d
+       leal    (%r10,%r13,1),%r10d
+       xorl    %eax,%edi
+       rorxl   $13,%r11d,%r14d
+       rorxl   $2,%r11d,%r13d
+       leal    (%rcx,%r10,1),%ecx
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %eax,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r10,%r15,1),%r10d
+       movl    %edx,%r12d
+       addl    8+16(%rbp),%r9d
+       andl    %ecx,%r12d
+       rorxl   $25,%ecx,%r13d
+       rorxl   $11,%ecx,%r15d
+       leal    (%r10,%r14,1),%r10d
+       leal    (%r9,%r12,1),%r9d
+       andnl   %r8d,%ecx,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%ecx,%r14d
+       leal    (%r9,%r12,1),%r9d
+       xorl    %r14d,%r13d
+       movl    %r10d,%r15d
+       rorxl   $22,%r10d,%r12d
+       leal    (%r9,%r13,1),%r9d
+       xorl    %r11d,%r15d
+       rorxl   $13,%r10d,%r14d
+       rorxl   $2,%r10d,%r13d
+       leal    (%rbx,%r9,1),%ebx
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r11d,%edi
+       xorl    %r13d,%r14d
+       leal    (%r9,%rdi,1),%r9d
+       movl    %ecx,%r12d
+       addl    12+16(%rbp),%r8d
+       andl    %ebx,%r12d
+       rorxl   $25,%ebx,%r13d
+       rorxl   $11,%ebx,%edi
+       leal    (%r9,%r14,1),%r9d
+       leal    (%r8,%r12,1),%r8d
+       andnl   %edx,%ebx,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%ebx,%r14d
+       leal    (%r8,%r12,1),%r8d
+       xorl    %r14d,%r13d
+       movl    %r9d,%edi
+       rorxl   $22,%r9d,%r12d
+       leal    (%r8,%r13,1),%r8d
+       xorl    %r10d,%edi
+       rorxl   $13,%r9d,%r14d
+       rorxl   $2,%r9d,%r13d
+       leal    (%rax,%r8,1),%eax
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r10d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%r8,%r15,1),%r8d
+       movl    %ebx,%r12d
+       addl    32+16(%rbp),%edx
+       andl    %eax,%r12d
+       rorxl   $25,%eax,%r13d
+       rorxl   $11,%eax,%r15d
+       leal    (%r8,%r14,1),%r8d
+       leal    (%rdx,%r12,1),%edx
+       andnl   %ecx,%eax,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%eax,%r14d
+       leal    (%rdx,%r12,1),%edx
+       xorl    %r14d,%r13d
+       movl    %r8d,%r15d
+       rorxl   $22,%r8d,%r12d
+       leal    (%rdx,%r13,1),%edx
+       xorl    %r9d,%r15d
+       rorxl   $13,%r8d,%r14d
+       rorxl   $2,%r8d,%r13d
+       leal    (%r11,%rdx,1),%r11d
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %r9d,%edi
+       xorl    %r13d,%r14d
+       leal    (%rdx,%rdi,1),%edx
+       movl    %eax,%r12d
+       addl    36+16(%rbp),%ecx
+       andl    %r11d,%r12d
+       rorxl   $25,%r11d,%r13d
+       rorxl   $11,%r11d,%edi
+       leal    (%rdx,%r14,1),%edx
+       leal    (%rcx,%r12,1),%ecx
+       andnl   %ebx,%r11d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r11d,%r14d
+       leal    (%rcx,%r12,1),%ecx
+       xorl    %r14d,%r13d
+       movl    %edx,%edi
+       rorxl   $22,%edx,%r12d
+       leal    (%rcx,%r13,1),%ecx
+       xorl    %r8d,%edi
+       rorxl   $13,%edx,%r14d
+       rorxl   $2,%edx,%r13d
+       leal    (%r10,%rcx,1),%r10d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %r8d,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rcx,%r15,1),%ecx
+       movl    %r11d,%r12d
+       addl    40+16(%rbp),%ebx
+       andl    %r10d,%r12d
+       rorxl   $25,%r10d,%r13d
+       rorxl   $11,%r10d,%r15d
+       leal    (%rcx,%r14,1),%ecx
+       leal    (%rbx,%r12,1),%ebx
+       andnl   %eax,%r10d,%r12d
+       xorl    %r15d,%r13d
+       rorxl   $6,%r10d,%r14d
+       leal    (%rbx,%r12,1),%ebx
+       xorl    %r14d,%r13d
+       movl    %ecx,%r15d
+       rorxl   $22,%ecx,%r12d
+       leal    (%rbx,%r13,1),%ebx
+       xorl    %edx,%r15d
+       rorxl   $13,%ecx,%r14d
+       rorxl   $2,%ecx,%r13d
+       leal    (%r9,%rbx,1),%r9d
+       andl    %r15d,%edi
+       xorl    %r12d,%r14d
+       xorl    %edx,%edi
+       xorl    %r13d,%r14d
+       leal    (%rbx,%rdi,1),%ebx
+       movl    %r10d,%r12d
+       addl    44+16(%rbp),%eax
+       andl    %r9d,%r12d
+       rorxl   $25,%r9d,%r13d
+       rorxl   $11,%r9d,%edi
+       leal    (%rbx,%r14,1),%ebx
+       leal    (%rax,%r12,1),%eax
+       andnl   %r11d,%r9d,%r12d
+       xorl    %edi,%r13d
+       rorxl   $6,%r9d,%r14d
+       leal    (%rax,%r12,1),%eax
+       xorl    %r14d,%r13d
+       movl    %ebx,%edi
+       rorxl   $22,%ebx,%r12d
+       leal    (%rax,%r13,1),%eax
+       xorl    %ecx,%edi
+       rorxl   $13,%ebx,%r14d
+       rorxl   $2,%ebx,%r13d
+       leal    (%r8,%rax,1),%r8d
+       andl    %edi,%r15d
+       xorl    %r12d,%r14d
+       xorl    %ecx,%r15d
+       xorl    %r13d,%r14d
+       leal    (%rax,%r15,1),%eax
+       movl    %r9d,%r12d
+       leaq    -64(%rbp),%rbp
+       cmpq    %rsp,%rbp
+       jae     .Lower_avx2
+
+       movq    512(%rsp),%rdi
+       addl    %r14d,%eax
+
+       leaq    448(%rsp),%rsp
+
+.cfi_escape    0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08
+
+       addl    0(%rdi),%eax
+       addl    4(%rdi),%ebx
+       addl    8(%rdi),%ecx
+       addl    12(%rdi),%edx
+       addl    16(%rdi),%r8d
+       addl    20(%rdi),%r9d
+       leaq    128(%rsi),%rsi
+       addl    24(%rdi),%r10d
+       movq    %rsi,%r12
+       addl    28(%rdi),%r11d
+       cmpq    64+16(%rsp),%rsi
+
+       movl    %eax,0(%rdi)
+       cmoveq  %rsp,%r12
+       movl    %ebx,4(%rdi)
+       movl    %ecx,8(%rdi)
+       movl    %edx,12(%rdi)
+       movl    %r8d,16(%rdi)
+       movl    %r9d,20(%rdi)
+       movl    %r10d,24(%rdi)
+       movl    %r11d,28(%rdi)
+
+       jbe     .Loop_avx2
+       leaq    (%rsp),%rbp
+
+
+.cfi_escape    0x0f,0x06,0x76,0xd8,0x00,0x06,0x23,0x08
+
+.Ldone_avx2:
+       movq    88(%rbp),%rsi
+.cfi_def_cfa   %rsi,8
+       vzeroupper
+       movq    -48(%rsi),%r15
+.cfi_restore   %r15
+       movq    -40(%rsi),%r14
+.cfi_restore   %r14
+       movq    -32(%rsi),%r13
+.cfi_restore   %r13
+       movq    -24(%rsi),%r12
+.cfi_restore   %r12
+       movq    -16(%rsi),%rbp
+.cfi_restore   %rbp
+       movq    -8(%rsi),%rbx
+.cfi_restore   %rbx
+       leaq    (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
+.Lepilogue_avx2:
+       RET
+.cfi_endproc
+SET_SIZE(zfs_sha256_transform_avx2)
+
+#if defined(__ELF__)
+       .section .note.GNU-stack,"",%progbits
+#endif
+#endif
diff --git a/module/icp/asm-x86_64/sha2/sha512-x86_64.S b/module/icp/asm-x86_64/sha2/sha512-x86_64.S
new file mode 100644 (file)
index 0000000..ce8e108
--- /dev/null
@@ -0,0 +1,4011 @@
+/*
+ * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Portions Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ * - modified assembly to fit into OpenZFS
+ */
+
+#if defined(__x86_64)
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+.section .rodata
+
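+/*
+ * SHA-512 round constants (K512). Each constant is stored twice; this
+ * duplication appears intended for the 256-bit AVX2 loads used later in this
+ * file, so one vmovdqa fills both 128-bit lanes (an inference from the table
+ * layout, not stated in the commit). The trailing
+ * 0x0001020304050607/0x08090a0b0c0d0e0f pair looks like a byte-swap shuffle
+ * mask rather than a round constant.
+ */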
+.align 64
+.type  K512,@object
+K512:
+.quad  0x428a2f98d728ae22,0x7137449123ef65cd
+.quad  0x428a2f98d728ae22,0x7137449123ef65cd
+.quad  0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad  0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+.quad  0x3956c25bf348b538,0x59f111f1b605d019
+.quad  0x3956c25bf348b538,0x59f111f1b605d019
+.quad  0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad  0x923f82a4af194f9b,0xab1c5ed5da6d8118
+.quad  0xd807aa98a3030242,0x12835b0145706fbe
+.quad  0xd807aa98a3030242,0x12835b0145706fbe
+.quad  0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad  0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+.quad  0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad  0x72be5d74f27b896f,0x80deb1fe3b1696b1
+.quad  0x9bdc06a725c71235,0xc19bf174cf692694
+.quad  0x9bdc06a725c71235,0xc19bf174cf692694
+.quad  0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad  0xe49b69c19ef14ad2,0xefbe4786384f25e3
+.quad  0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad  0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+.quad  0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad  0x2de92c6f592b0275,0x4a7484aa6ea6e483
+.quad  0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad  0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+.quad  0x983e5152ee66dfab,0xa831c66d2db43210
+.quad  0x983e5152ee66dfab,0xa831c66d2db43210
+.quad  0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad  0xb00327c898fb213f,0xbf597fc7beef0ee4
+.quad  0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad  0xc6e00bf33da88fc2,0xd5a79147930aa725
+.quad  0x06ca6351e003826f,0x142929670a0e6e70
+.quad  0x06ca6351e003826f,0x142929670a0e6e70
+.quad  0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad  0x27b70a8546d22ffc,0x2e1b21385c26c926
+.quad  0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad  0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+.quad  0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad  0x650a73548baf63de,0x766a0abb3c77b2a8
+.quad  0x81c2c92e47edaee6,0x92722c851482353b
+.quad  0x81c2c92e47edaee6,0x92722c851482353b
+.quad  0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad  0xa2bfe8a14cf10364,0xa81a664bbc423001
+.quad  0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad  0xc24b8b70d0f89791,0xc76c51a30654be30
+.quad  0xd192e819d6ef5218,0xd69906245565a910
+.quad  0xd192e819d6ef5218,0xd69906245565a910
+.quad  0xf40e35855771202a,0x106aa07032bbd1b8
+.quad  0xf40e35855771202a,0x106aa07032bbd1b8
+.quad  0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad  0x19a4c116b8d2d0c8,0x1e376c085141ab53
+.quad  0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad  0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+.quad  0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad  0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+.quad  0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad  0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+.quad  0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad  0x748f82ee5defb2fc,0x78a5636f43172f60
+.quad  0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad  0x84c87814a1f0ab72,0x8cc702081a6439ec
+.quad  0x90befffa23631e28,0xa4506cebde82bde9
+.quad  0x90befffa23631e28,0xa4506cebde82bde9
+.quad  0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad  0xbef9a3f7b2c67915,0xc67178f2e372532b
+.quad  0xca273eceea26619c,0xd186b8c721c0c207
+.quad  0xca273eceea26619c,0xd186b8c721c0c207
+.quad  0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad  0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+.quad  0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad  0x06f067aa72176fba,0x0a637dc5a2c898a6
+.quad  0x113f9804bef90dae,0x1b710b35131c471b
+.quad  0x113f9804bef90dae,0x1b710b35131c471b
+.quad  0x28db77f523047d84,0x32caab7b40c72493
+.quad  0x28db77f523047d84,0x32caab7b40c72493
+.quad  0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad  0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+.quad  0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad  0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+.quad  0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad  0x5fcb6fab3ad6faec,0x6c44198c4a475817
+.quad  0x0001020304050607,0x08090a0b0c0d0e0f
+.quad  0x0001020304050607,0x08090a0b0c0d0e0f
+
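+/*
+ * Calling convention, as read off the prologue below (the C prototype is an
+ * illustrative sketch; the callers live in the new sha512_impl.c):
+ *   zfs_sha512_transform_x64(uint64_t state[8], const void *data, size_t blks)
+ *     %rdi: SHA-512 state, eight 64-bit words a..h (loaded from 0..56(%rdi))
+ *     %rsi: input message blocks
+ *     %rdx: number of 128-byte blocks; the prologue computes the end pointer
+ *           as %rsi + %rdx*128 and stores it at 128+16(%rsp).
+ */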
+ENTRY_ALIGN(zfs_sha512_transform_x64, 16)
+.cfi_startproc
+       ENDBR
+       movq    %rsp,%rax
+.cfi_def_cfa_register  %rax
+       pushq   %rbx
+.cfi_offset    %rbx,-16
+       pushq   %rbp
+.cfi_offset    %rbp,-24
+       pushq   %r12
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_offset    %r14,-48
+       pushq   %r15
+.cfi_offset    %r15,-56
+       shlq    $4,%rdx
+       subq    $128+32,%rsp
+       leaq    (%rsi,%rdx,8),%rdx
+       andq    $-64,%rsp
+       movq    %rdi,128+0(%rsp)
+       movq    %rsi,128+8(%rsp)
+       movq    %rdx,128+16(%rsp)
+       movq    %rax,152(%rsp)
+.cfi_escape    0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+.Lprologue:
+       movq    0(%rdi),%rax
+       movq    8(%rdi),%rbx
+       movq    16(%rdi),%rcx
+       movq    24(%rdi),%rdx
+       movq    32(%rdi),%r8
+       movq    40(%rdi),%r9
+       movq    48(%rdi),%r10
+       movq    56(%rdi),%r11
+       jmp     .Lloop
+.align 16
+.Lloop:
+       movq    %rbx,%rdi
+       leaq    K512(%rip),%rbp
+       xorq    %rcx,%rdi
+       movq    0(%rsi),%r12
+       movq    %r8,%r13
+       movq    %rax,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r9,%r15
+       xorq    %r8,%r13
+       rorq    $5,%r14
+       xorq    %r10,%r15
+       movq    %r12,0(%rsp)
+       xorq    %rax,%r14
+       andq    %r8,%r15
+       rorq    $4,%r13
+       addq    %r11,%r12
+       xorq    %r10,%r15
+       rorq    $6,%r14
+       xorq    %r8,%r13
+       addq    %r15,%r12
+       movq    %rax,%r15
+       addq    (%rbp),%r12
+       xorq    %rax,%r14
+       xorq    %rbx,%r15
+       rorq    $14,%r13
+       movq    %rbx,%r11
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %rdi,%r11
+       addq    %r12,%rdx
+       addq    %r12,%r11
+       leaq    8(%rbp),%rbp
+       addq    %r14,%r11
+       movq    8(%rsi),%r12
+       movq    %rdx,%r13
+       movq    %r11,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r8,%rdi
+       xorq    %rdx,%r13
+       rorq    $5,%r14
+       xorq    %r9,%rdi
+       movq    %r12,8(%rsp)
+       xorq    %r11,%r14
+       andq    %rdx,%rdi
+       rorq    $4,%r13
+       addq    %r10,%r12
+       xorq    %r9,%rdi
+       rorq    $6,%r14
+       xorq    %rdx,%r13
+       addq    %rdi,%r12
+       movq    %r11,%rdi
+       addq    (%rbp),%r12
+       xorq    %r11,%r14
+       xorq    %rax,%rdi
+       rorq    $14,%r13
+       movq    %rax,%r10
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %r15,%r10
+       addq    %r12,%rcx
+       addq    %r12,%r10
+       leaq    24(%rbp),%rbp
+       addq    %r14,%r10
+       movq    16(%rsi),%r12
+       movq    %rcx,%r13
+       movq    %r10,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rdx,%r15
+       xorq    %rcx,%r13
+       rorq    $5,%r14
+       xorq    %r8,%r15
+       movq    %r12,16(%rsp)
+       xorq    %r10,%r14
+       andq    %rcx,%r15
+       rorq    $4,%r13
+       addq    %r9,%r12
+       xorq    %r8,%r15
+       rorq    $6,%r14
+       xorq    %rcx,%r13
+       addq    %r15,%r12
+       movq    %r10,%r15
+       addq    (%rbp),%r12
+       xorq    %r10,%r14
+       xorq    %r11,%r15
+       rorq    $14,%r13
+       movq    %r11,%r9
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %rdi,%r9
+       addq    %r12,%rbx
+       addq    %r12,%r9
+       leaq    8(%rbp),%rbp
+       addq    %r14,%r9
+       movq    24(%rsi),%r12
+       movq    %rbx,%r13
+       movq    %r9,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rcx,%rdi
+       xorq    %rbx,%r13
+       rorq    $5,%r14
+       xorq    %rdx,%rdi
+       movq    %r12,24(%rsp)
+       xorq    %r9,%r14
+       andq    %rbx,%rdi
+       rorq    $4,%r13
+       addq    %r8,%r12
+       xorq    %rdx,%rdi
+       rorq    $6,%r14
+       xorq    %rbx,%r13
+       addq    %rdi,%r12
+       movq    %r9,%rdi
+       addq    (%rbp),%r12
+       xorq    %r9,%r14
+       xorq    %r10,%rdi
+       rorq    $14,%r13
+       movq    %r10,%r8
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %r15,%r8
+       addq    %r12,%rax
+       addq    %r12,%r8
+       leaq    24(%rbp),%rbp
+       addq    %r14,%r8
+       movq    32(%rsi),%r12
+       movq    %rax,%r13
+       movq    %r8,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rbx,%r15
+       xorq    %rax,%r13
+       rorq    $5,%r14
+       xorq    %rcx,%r15
+       movq    %r12,32(%rsp)
+       xorq    %r8,%r14
+       andq    %rax,%r15
+       rorq    $4,%r13
+       addq    %rdx,%r12
+       xorq    %rcx,%r15
+       rorq    $6,%r14
+       xorq    %rax,%r13
+       addq    %r15,%r12
+       movq    %r8,%r15
+       addq    (%rbp),%r12
+       xorq    %r8,%r14
+       xorq    %r9,%r15
+       rorq    $14,%r13
+       movq    %r9,%rdx
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %rdi,%rdx
+       addq    %r12,%r11
+       addq    %r12,%rdx
+       leaq    8(%rbp),%rbp
+       addq    %r14,%rdx
+       movq    40(%rsi),%r12
+       movq    %r11,%r13
+       movq    %rdx,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rax,%rdi
+       xorq    %r11,%r13
+       rorq    $5,%r14
+       xorq    %rbx,%rdi
+       movq    %r12,40(%rsp)
+       xorq    %rdx,%r14
+       andq    %r11,%rdi
+       rorq    $4,%r13
+       addq    %rcx,%r12
+       xorq    %rbx,%rdi
+       rorq    $6,%r14
+       xorq    %r11,%r13
+       addq    %rdi,%r12
+       movq    %rdx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rdx,%r14
+       xorq    %r8,%rdi
+       rorq    $14,%r13
+       movq    %r8,%rcx
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %r15,%rcx
+       addq    %r12,%r10
+       addq    %r12,%rcx
+       leaq    24(%rbp),%rbp
+       addq    %r14,%rcx
+       movq    48(%rsi),%r12
+       movq    %r10,%r13
+       movq    %rcx,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r11,%r15
+       xorq    %r10,%r13
+       rorq    $5,%r14
+       xorq    %rax,%r15
+       movq    %r12,48(%rsp)
+       xorq    %rcx,%r14
+       andq    %r10,%r15
+       rorq    $4,%r13
+       addq    %rbx,%r12
+       xorq    %rax,%r15
+       rorq    $6,%r14
+       xorq    %r10,%r13
+       addq    %r15,%r12
+       movq    %rcx,%r15
+       addq    (%rbp),%r12
+       xorq    %rcx,%r14
+       xorq    %rdx,%r15
+       rorq    $14,%r13
+       movq    %rdx,%rbx
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %rdi,%rbx
+       addq    %r12,%r9
+       addq    %r12,%rbx
+       leaq    8(%rbp),%rbp
+       addq    %r14,%rbx
+       movq    56(%rsi),%r12
+       movq    %r9,%r13
+       movq    %rbx,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r10,%rdi
+       xorq    %r9,%r13
+       rorq    $5,%r14
+       xorq    %r11,%rdi
+       movq    %r12,56(%rsp)
+       xorq    %rbx,%r14
+       andq    %r9,%rdi
+       rorq    $4,%r13
+       addq    %rax,%r12
+       xorq    %r11,%rdi
+       rorq    $6,%r14
+       xorq    %r9,%r13
+       addq    %rdi,%r12
+       movq    %rbx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rbx,%r14
+       xorq    %rcx,%rdi
+       rorq    $14,%r13
+       movq    %rcx,%rax
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %r15,%rax
+       addq    %r12,%r8
+       addq    %r12,%rax
+       leaq    24(%rbp),%rbp
+       addq    %r14,%rax
+       movq    64(%rsi),%r12
+       movq    %r8,%r13
+       movq    %rax,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r9,%r15
+       xorq    %r8,%r13
+       rorq    $5,%r14
+       xorq    %r10,%r15
+       movq    %r12,64(%rsp)
+       xorq    %rax,%r14
+       andq    %r8,%r15
+       rorq    $4,%r13
+       addq    %r11,%r12
+       xorq    %r10,%r15
+       rorq    $6,%r14
+       xorq    %r8,%r13
+       addq    %r15,%r12
+       movq    %rax,%r15
+       addq    (%rbp),%r12
+       xorq    %rax,%r14
+       xorq    %rbx,%r15
+       rorq    $14,%r13
+       movq    %rbx,%r11
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %rdi,%r11
+       addq    %r12,%rdx
+       addq    %r12,%r11
+       leaq    8(%rbp),%rbp
+       addq    %r14,%r11
+       movq    72(%rsi),%r12
+       movq    %rdx,%r13
+       movq    %r11,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r8,%rdi
+       xorq    %rdx,%r13
+       rorq    $5,%r14
+       xorq    %r9,%rdi
+       movq    %r12,72(%rsp)
+       xorq    %r11,%r14
+       andq    %rdx,%rdi
+       rorq    $4,%r13
+       addq    %r10,%r12
+       xorq    %r9,%rdi
+       rorq    $6,%r14
+       xorq    %rdx,%r13
+       addq    %rdi,%r12
+       movq    %r11,%rdi
+       addq    (%rbp),%r12
+       xorq    %r11,%r14
+       xorq    %rax,%rdi
+       rorq    $14,%r13
+       movq    %rax,%r10
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %r15,%r10
+       addq    %r12,%rcx
+       addq    %r12,%r10
+       leaq    24(%rbp),%rbp
+       addq    %r14,%r10
+       movq    80(%rsi),%r12
+       movq    %rcx,%r13
+       movq    %r10,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rdx,%r15
+       xorq    %rcx,%r13
+       rorq    $5,%r14
+       xorq    %r8,%r15
+       movq    %r12,80(%rsp)
+       xorq    %r10,%r14
+       andq    %rcx,%r15
+       rorq    $4,%r13
+       addq    %r9,%r12
+       xorq    %r8,%r15
+       rorq    $6,%r14
+       xorq    %rcx,%r13
+       addq    %r15,%r12
+       movq    %r10,%r15
+       addq    (%rbp),%r12
+       xorq    %r10,%r14
+       xorq    %r11,%r15
+       rorq    $14,%r13
+       movq    %r11,%r9
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %rdi,%r9
+       addq    %r12,%rbx
+       addq    %r12,%r9
+       leaq    8(%rbp),%rbp
+       addq    %r14,%r9
+       movq    88(%rsi),%r12
+       movq    %rbx,%r13
+       movq    %r9,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rcx,%rdi
+       xorq    %rbx,%r13
+       rorq    $5,%r14
+       xorq    %rdx,%rdi
+       movq    %r12,88(%rsp)
+       xorq    %r9,%r14
+       andq    %rbx,%rdi
+       rorq    $4,%r13
+       addq    %r8,%r12
+       xorq    %rdx,%rdi
+       rorq    $6,%r14
+       xorq    %rbx,%r13
+       addq    %rdi,%r12
+       movq    %r9,%rdi
+       addq    (%rbp),%r12
+       xorq    %r9,%r14
+       xorq    %r10,%rdi
+       rorq    $14,%r13
+       movq    %r10,%r8
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %r15,%r8
+       addq    %r12,%rax
+       addq    %r12,%r8
+       leaq    24(%rbp),%rbp
+       addq    %r14,%r8
+       movq    96(%rsi),%r12
+       movq    %rax,%r13
+       movq    %r8,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rbx,%r15
+       xorq    %rax,%r13
+       rorq    $5,%r14
+       xorq    %rcx,%r15
+       movq    %r12,96(%rsp)
+       xorq    %r8,%r14
+       andq    %rax,%r15
+       rorq    $4,%r13
+       addq    %rdx,%r12
+       xorq    %rcx,%r15
+       rorq    $6,%r14
+       xorq    %rax,%r13
+       addq    %r15,%r12
+       movq    %r8,%r15
+       addq    (%rbp),%r12
+       xorq    %r8,%r14
+       xorq    %r9,%r15
+       rorq    $14,%r13
+       movq    %r9,%rdx
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %rdi,%rdx
+       addq    %r12,%r11
+       addq    %r12,%rdx
+       leaq    8(%rbp),%rbp
+       addq    %r14,%rdx
+       movq    104(%rsi),%r12
+       movq    %r11,%r13
+       movq    %rdx,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %rax,%rdi
+       xorq    %r11,%r13
+       rorq    $5,%r14
+       xorq    %rbx,%rdi
+       movq    %r12,104(%rsp)
+       xorq    %rdx,%r14
+       andq    %r11,%rdi
+       rorq    $4,%r13
+       addq    %rcx,%r12
+       xorq    %rbx,%rdi
+       rorq    $6,%r14
+       xorq    %r11,%r13
+       addq    %rdi,%r12
+       movq    %rdx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rdx,%r14
+       xorq    %r8,%rdi
+       rorq    $14,%r13
+       movq    %r8,%rcx
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %r15,%rcx
+       addq    %r12,%r10
+       addq    %r12,%rcx
+       leaq    24(%rbp),%rbp
+       addq    %r14,%rcx
+       movq    112(%rsi),%r12
+       movq    %r10,%r13
+       movq    %rcx,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r11,%r15
+       xorq    %r10,%r13
+       rorq    $5,%r14
+       xorq    %rax,%r15
+       movq    %r12,112(%rsp)
+       xorq    %rcx,%r14
+       andq    %r10,%r15
+       rorq    $4,%r13
+       addq    %rbx,%r12
+       xorq    %rax,%r15
+       rorq    $6,%r14
+       xorq    %r10,%r13
+       addq    %r15,%r12
+       movq    %rcx,%r15
+       addq    (%rbp),%r12
+       xorq    %rcx,%r14
+       xorq    %rdx,%r15
+       rorq    $14,%r13
+       movq    %rdx,%rbx
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %rdi,%rbx
+       addq    %r12,%r9
+       addq    %r12,%rbx
+       leaq    8(%rbp),%rbp
+       addq    %r14,%rbx
+       movq    120(%rsi),%r12
+       movq    %r9,%r13
+       movq    %rbx,%r14
+       bswapq  %r12
+       rorq    $23,%r13
+       movq    %r10,%rdi
+       xorq    %r9,%r13
+       rorq    $5,%r14
+       xorq    %r11,%rdi
+       movq    %r12,120(%rsp)
+       xorq    %rbx,%r14
+       andq    %r9,%rdi
+       rorq    $4,%r13
+       addq    %rax,%r12
+       xorq    %r11,%rdi
+       rorq    $6,%r14
+       xorq    %r9,%r13
+       addq    %rdi,%r12
+       movq    %rbx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rbx,%r14
+       xorq    %rcx,%rdi
+       rorq    $14,%r13
+       movq    %rcx,%rax
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %r15,%rax
+       addq    %r12,%r8
+       addq    %r12,%rax
+       leaq    24(%rbp),%rbp
+       jmp     .Lrounds_16_xx
+.align 16
+.Lrounds_16_xx:
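+       /*
+        * Rounds 16..79 of the scalar path: each pass of this unrolled
+        * block expands sixteen message-schedule words in place on the
+        * stack and runs the corresponding rounds.  State words a..h live
+        * in %rax,%rbx,%rcx,%rdx,%r8..%r11; %rbp walks the K512 table and
+        * the cmpb at the bottom detects its end.
+        */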
+       movq    8(%rsp),%r13
+       movq    112(%rsp),%r15
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rax
+       movq    %r15,%r14
+       rorq    $42,%r15
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    72(%rsp),%r12
+       addq    0(%rsp),%r12
+       movq    %r8,%r13
+       addq    %r15,%r12
+       movq    %rax,%r14
+       rorq    $23,%r13
+       movq    %r9,%r15
+       xorq    %r8,%r13
+       rorq    $5,%r14
+       xorq    %r10,%r15
+       movq    %r12,0(%rsp)
+       xorq    %rax,%r14
+       andq    %r8,%r15
+       rorq    $4,%r13
+       addq    %r11,%r12
+       xorq    %r10,%r15
+       rorq    $6,%r14
+       xorq    %r8,%r13
+       addq    %r15,%r12
+       movq    %rax,%r15
+       addq    (%rbp),%r12
+       xorq    %rax,%r14
+       xorq    %rbx,%r15
+       rorq    $14,%r13
+       movq    %rbx,%r11
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %rdi,%r11
+       addq    %r12,%rdx
+       addq    %r12,%r11
+       leaq    8(%rbp),%rbp
+       movq    16(%rsp),%r13
+       movq    120(%rsp),%rdi
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r11
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    80(%rsp),%r12
+       addq    8(%rsp),%r12
+       movq    %rdx,%r13
+       addq    %rdi,%r12
+       movq    %r11,%r14
+       rorq    $23,%r13
+       movq    %r8,%rdi
+       xorq    %rdx,%r13
+       rorq    $5,%r14
+       xorq    %r9,%rdi
+       movq    %r12,8(%rsp)
+       xorq    %r11,%r14
+       andq    %rdx,%rdi
+       rorq    $4,%r13
+       addq    %r10,%r12
+       xorq    %r9,%rdi
+       rorq    $6,%r14
+       xorq    %rdx,%r13
+       addq    %rdi,%r12
+       movq    %r11,%rdi
+       addq    (%rbp),%r12
+       xorq    %r11,%r14
+       xorq    %rax,%rdi
+       rorq    $14,%r13
+       movq    %rax,%r10
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %r15,%r10
+       addq    %r12,%rcx
+       addq    %r12,%r10
+       leaq    24(%rbp),%rbp
+       movq    24(%rsp),%r13
+       movq    0(%rsp),%r15
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r10
+       movq    %r15,%r14
+       rorq    $42,%r15
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    88(%rsp),%r12
+       addq    16(%rsp),%r12
+       movq    %rcx,%r13
+       addq    %r15,%r12
+       movq    %r10,%r14
+       rorq    $23,%r13
+       movq    %rdx,%r15
+       xorq    %rcx,%r13
+       rorq    $5,%r14
+       xorq    %r8,%r15
+       movq    %r12,16(%rsp)
+       xorq    %r10,%r14
+       andq    %rcx,%r15
+       rorq    $4,%r13
+       addq    %r9,%r12
+       xorq    %r8,%r15
+       rorq    $6,%r14
+       xorq    %rcx,%r13
+       addq    %r15,%r12
+       movq    %r10,%r15
+       addq    (%rbp),%r12
+       xorq    %r10,%r14
+       xorq    %r11,%r15
+       rorq    $14,%r13
+       movq    %r11,%r9
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %rdi,%r9
+       addq    %r12,%rbx
+       addq    %r12,%r9
+       leaq    8(%rbp),%rbp
+       movq    32(%rsp),%r13
+       movq    8(%rsp),%rdi
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r9
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    96(%rsp),%r12
+       addq    24(%rsp),%r12
+       movq    %rbx,%r13
+       addq    %rdi,%r12
+       movq    %r9,%r14
+       rorq    $23,%r13
+       movq    %rcx,%rdi
+       xorq    %rbx,%r13
+       rorq    $5,%r14
+       xorq    %rdx,%rdi
+       movq    %r12,24(%rsp)
+       xorq    %r9,%r14
+       andq    %rbx,%rdi
+       rorq    $4,%r13
+       addq    %r8,%r12
+       xorq    %rdx,%rdi
+       rorq    $6,%r14
+       xorq    %rbx,%r13
+       addq    %rdi,%r12
+       movq    %r9,%rdi
+       addq    (%rbp),%r12
+       xorq    %r9,%r14
+       xorq    %r10,%rdi
+       rorq    $14,%r13
+       movq    %r10,%r8
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %r15,%r8
+       addq    %r12,%rax
+       addq    %r12,%r8
+       leaq    24(%rbp),%rbp
+       movq    40(%rsp),%r13
+       movq    16(%rsp),%r15
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r8
+       movq    %r15,%r14
+       rorq    $42,%r15
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    104(%rsp),%r12
+       addq    32(%rsp),%r12
+       movq    %rax,%r13
+       addq    %r15,%r12
+       movq    %r8,%r14
+       rorq    $23,%r13
+       movq    %rbx,%r15
+       xorq    %rax,%r13
+       rorq    $5,%r14
+       xorq    %rcx,%r15
+       movq    %r12,32(%rsp)
+       xorq    %r8,%r14
+       andq    %rax,%r15
+       rorq    $4,%r13
+       addq    %rdx,%r12
+       xorq    %rcx,%r15
+       rorq    $6,%r14
+       xorq    %rax,%r13
+       addq    %r15,%r12
+       movq    %r8,%r15
+       addq    (%rbp),%r12
+       xorq    %r8,%r14
+       xorq    %r9,%r15
+       rorq    $14,%r13
+       movq    %r9,%rdx
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %rdi,%rdx
+       addq    %r12,%r11
+       addq    %r12,%rdx
+       leaq    8(%rbp),%rbp
+       movq    48(%rsp),%r13
+       movq    24(%rsp),%rdi
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rdx
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    112(%rsp),%r12
+       addq    40(%rsp),%r12
+       movq    %r11,%r13
+       addq    %rdi,%r12
+       movq    %rdx,%r14
+       rorq    $23,%r13
+       movq    %rax,%rdi
+       xorq    %r11,%r13
+       rorq    $5,%r14
+       xorq    %rbx,%rdi
+       movq    %r12,40(%rsp)
+       xorq    %rdx,%r14
+       andq    %r11,%rdi
+       rorq    $4,%r13
+       addq    %rcx,%r12
+       xorq    %rbx,%rdi
+       rorq    $6,%r14
+       xorq    %r11,%r13
+       addq    %rdi,%r12
+       movq    %rdx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rdx,%r14
+       xorq    %r8,%rdi
+       rorq    $14,%r13
+       movq    %r8,%rcx
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %r15,%rcx
+       addq    %r12,%r10
+       addq    %r12,%rcx
+       leaq    24(%rbp),%rbp
+       movq    56(%rsp),%r13
+       movq    32(%rsp),%r15
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rcx
+       movq    %r15,%r14
+       rorq    $42,%r15
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    120(%rsp),%r12
+       addq    48(%rsp),%r12
+       movq    %r10,%r13
+       addq    %r15,%r12
+       movq    %rcx,%r14
+       rorq    $23,%r13
+       movq    %r11,%r15
+       xorq    %r10,%r13
+       rorq    $5,%r14
+       xorq    %rax,%r15
+       movq    %r12,48(%rsp)
+       xorq    %rcx,%r14
+       andq    %r10,%r15
+       rorq    $4,%r13
+       addq    %rbx,%r12
+       xorq    %rax,%r15
+       rorq    $6,%r14
+       xorq    %r10,%r13
+       addq    %r15,%r12
+       movq    %rcx,%r15
+       addq    (%rbp),%r12
+       xorq    %rcx,%r14
+       xorq    %rdx,%r15
+       rorq    $14,%r13
+       movq    %rdx,%rbx
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %rdi,%rbx
+       addq    %r12,%r9
+       addq    %r12,%rbx
+       leaq    8(%rbp),%rbp
+       movq    64(%rsp),%r13
+       movq    40(%rsp),%rdi
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rbx
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    0(%rsp),%r12
+       addq    56(%rsp),%r12
+       movq    %r9,%r13
+       addq    %rdi,%r12
+       movq    %rbx,%r14
+       rorq    $23,%r13
+       movq    %r10,%rdi
+       xorq    %r9,%r13
+       rorq    $5,%r14
+       xorq    %r11,%rdi
+       movq    %r12,56(%rsp)
+       xorq    %rbx,%r14
+       andq    %r9,%rdi
+       rorq    $4,%r13
+       addq    %rax,%r12
+       xorq    %r11,%rdi
+       rorq    $6,%r14
+       xorq    %r9,%r13
+       addq    %rdi,%r12
+       movq    %rbx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rbx,%r14
+       xorq    %rcx,%rdi
+       rorq    $14,%r13
+       movq    %rcx,%rax
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %r15,%rax
+       addq    %r12,%r8
+       addq    %r12,%rax
+       leaq    24(%rbp),%rbp
+       movq    72(%rsp),%r13
+       movq    48(%rsp),%r15
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rax
+       movq    %r15,%r14
+       rorq    $42,%r15
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    8(%rsp),%r12
+       addq    64(%rsp),%r12
+       movq    %r8,%r13
+       addq    %r15,%r12
+       movq    %rax,%r14
+       rorq    $23,%r13
+       movq    %r9,%r15
+       xorq    %r8,%r13
+       rorq    $5,%r14
+       xorq    %r10,%r15
+       movq    %r12,64(%rsp)
+       xorq    %rax,%r14
+       andq    %r8,%r15
+       rorq    $4,%r13
+       addq    %r11,%r12
+       xorq    %r10,%r15
+       rorq    $6,%r14
+       xorq    %r8,%r13
+       addq    %r15,%r12
+       movq    %rax,%r15
+       addq    (%rbp),%r12
+       xorq    %rax,%r14
+       xorq    %rbx,%r15
+       rorq    $14,%r13
+       movq    %rbx,%r11
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %rdi,%r11
+       addq    %r12,%rdx
+       addq    %r12,%r11
+       leaq    8(%rbp),%rbp
+       movq    80(%rsp),%r13
+       movq    56(%rsp),%rdi
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r11
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    16(%rsp),%r12
+       addq    72(%rsp),%r12
+       movq    %rdx,%r13
+       addq    %rdi,%r12
+       movq    %r11,%r14
+       rorq    $23,%r13
+       movq    %r8,%rdi
+       xorq    %rdx,%r13
+       rorq    $5,%r14
+       xorq    %r9,%rdi
+       movq    %r12,72(%rsp)
+       xorq    %r11,%r14
+       andq    %rdx,%rdi
+       rorq    $4,%r13
+       addq    %r10,%r12
+       xorq    %r9,%rdi
+       rorq    $6,%r14
+       xorq    %rdx,%r13
+       addq    %rdi,%r12
+       movq    %r11,%rdi
+       addq    (%rbp),%r12
+       xorq    %r11,%r14
+       xorq    %rax,%rdi
+       rorq    $14,%r13
+       movq    %rax,%r10
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %r15,%r10
+       addq    %r12,%rcx
+       addq    %r12,%r10
+       leaq    24(%rbp),%rbp
+       movq    88(%rsp),%r13
+       movq    64(%rsp),%r15
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r10
+       movq    %r15,%r14
+       rorq    $42,%r15
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    24(%rsp),%r12
+       addq    80(%rsp),%r12
+       movq    %rcx,%r13
+       addq    %r15,%r12
+       movq    %r10,%r14
+       rorq    $23,%r13
+       movq    %rdx,%r15
+       xorq    %rcx,%r13
+       rorq    $5,%r14
+       xorq    %r8,%r15
+       movq    %r12,80(%rsp)
+       xorq    %r10,%r14
+       andq    %rcx,%r15
+       rorq    $4,%r13
+       addq    %r9,%r12
+       xorq    %r8,%r15
+       rorq    $6,%r14
+       xorq    %rcx,%r13
+       addq    %r15,%r12
+       movq    %r10,%r15
+       addq    (%rbp),%r12
+       xorq    %r10,%r14
+       xorq    %r11,%r15
+       rorq    $14,%r13
+       movq    %r11,%r9
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %rdi,%r9
+       addq    %r12,%rbx
+       addq    %r12,%r9
+       leaq    8(%rbp),%rbp
+       movq    96(%rsp),%r13
+       movq    72(%rsp),%rdi
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r9
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    32(%rsp),%r12
+       addq    88(%rsp),%r12
+       movq    %rbx,%r13
+       addq    %rdi,%r12
+       movq    %r9,%r14
+       rorq    $23,%r13
+       movq    %rcx,%rdi
+       xorq    %rbx,%r13
+       rorq    $5,%r14
+       xorq    %rdx,%rdi
+       movq    %r12,88(%rsp)
+       xorq    %r9,%r14
+       andq    %rbx,%rdi
+       rorq    $4,%r13
+       addq    %r8,%r12
+       xorq    %rdx,%rdi
+       rorq    $6,%r14
+       xorq    %rbx,%r13
+       addq    %rdi,%r12
+       movq    %r9,%rdi
+       addq    (%rbp),%r12
+       xorq    %r9,%r14
+       xorq    %r10,%rdi
+       rorq    $14,%r13
+       movq    %r10,%r8
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %r15,%r8
+       addq    %r12,%rax
+       addq    %r12,%r8
+       leaq    24(%rbp),%rbp
+       movq    104(%rsp),%r13
+       movq    80(%rsp),%r15
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%r8
+       movq    %r15,%r14
+       rorq    $42,%r15
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    40(%rsp),%r12
+       addq    96(%rsp),%r12
+       movq    %rax,%r13
+       addq    %r15,%r12
+       movq    %r8,%r14
+       rorq    $23,%r13
+       movq    %rbx,%r15
+       xorq    %rax,%r13
+       rorq    $5,%r14
+       xorq    %rcx,%r15
+       movq    %r12,96(%rsp)
+       xorq    %r8,%r14
+       andq    %rax,%r15
+       rorq    $4,%r13
+       addq    %rdx,%r12
+       xorq    %rcx,%r15
+       rorq    $6,%r14
+       xorq    %rax,%r13
+       addq    %r15,%r12
+       movq    %r8,%r15
+       addq    (%rbp),%r12
+       xorq    %r8,%r14
+       xorq    %r9,%r15
+       rorq    $14,%r13
+       movq    %r9,%rdx
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %rdi,%rdx
+       addq    %r12,%r11
+       addq    %r12,%rdx
+       leaq    8(%rbp),%rbp
+       movq    112(%rsp),%r13
+       movq    88(%rsp),%rdi
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rdx
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    48(%rsp),%r12
+       addq    104(%rsp),%r12
+       movq    %r11,%r13
+       addq    %rdi,%r12
+       movq    %rdx,%r14
+       rorq    $23,%r13
+       movq    %rax,%rdi
+       xorq    %r11,%r13
+       rorq    $5,%r14
+       xorq    %rbx,%rdi
+       movq    %r12,104(%rsp)
+       xorq    %rdx,%r14
+       andq    %r11,%rdi
+       rorq    $4,%r13
+       addq    %rcx,%r12
+       xorq    %rbx,%rdi
+       rorq    $6,%r14
+       xorq    %r11,%r13
+       addq    %rdi,%r12
+       movq    %rdx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rdx,%r14
+       xorq    %r8,%rdi
+       rorq    $14,%r13
+       movq    %r8,%rcx
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %r15,%rcx
+       addq    %r12,%r10
+       addq    %r12,%rcx
+       leaq    24(%rbp),%rbp
+       movq    120(%rsp),%r13
+       movq    96(%rsp),%r15
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rcx
+       movq    %r15,%r14
+       rorq    $42,%r15
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%r15
+       shrq    $6,%r14
+       rorq    $19,%r15
+       xorq    %r13,%r12
+       xorq    %r14,%r15
+       addq    56(%rsp),%r12
+       addq    112(%rsp),%r12
+       movq    %r10,%r13
+       addq    %r15,%r12
+       movq    %rcx,%r14
+       rorq    $23,%r13
+       movq    %r11,%r15
+       xorq    %r10,%r13
+       rorq    $5,%r14
+       xorq    %rax,%r15
+       movq    %r12,112(%rsp)
+       xorq    %rcx,%r14
+       andq    %r10,%r15
+       rorq    $4,%r13
+       addq    %rbx,%r12
+       xorq    %rax,%r15
+       rorq    $6,%r14
+       xorq    %r10,%r13
+       addq    %r15,%r12
+       movq    %rcx,%r15
+       addq    (%rbp),%r12
+       xorq    %rcx,%r14
+       xorq    %rdx,%r15
+       rorq    $14,%r13
+       movq    %rdx,%rbx
+       andq    %r15,%rdi
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %rdi,%rbx
+       addq    %r12,%r9
+       addq    %r12,%rbx
+       leaq    8(%rbp),%rbp
+       movq    0(%rsp),%r13
+       movq    104(%rsp),%rdi
+       movq    %r13,%r12
+       rorq    $7,%r13
+       addq    %r14,%rbx
+       movq    %rdi,%r14
+       rorq    $42,%rdi
+       xorq    %r12,%r13
+       shrq    $7,%r12
+       rorq    $1,%r13
+       xorq    %r14,%rdi
+       shrq    $6,%r14
+       rorq    $19,%rdi
+       xorq    %r13,%r12
+       xorq    %r14,%rdi
+       addq    64(%rsp),%r12
+       addq    120(%rsp),%r12
+       movq    %r9,%r13
+       addq    %rdi,%r12
+       movq    %rbx,%r14
+       rorq    $23,%r13
+       movq    %r10,%rdi
+       xorq    %r9,%r13
+       rorq    $5,%r14
+       xorq    %r11,%rdi
+       movq    %r12,120(%rsp)
+       xorq    %rbx,%r14
+       andq    %r9,%rdi
+       rorq    $4,%r13
+       addq    %rax,%r12
+       xorq    %r11,%rdi
+       rorq    $6,%r14
+       xorq    %r9,%r13
+       addq    %rdi,%r12
+       movq    %rbx,%rdi
+       addq    (%rbp),%r12
+       xorq    %rbx,%r14
+       xorq    %rcx,%rdi
+       rorq    $14,%r13
+       movq    %rcx,%rax
+       andq    %rdi,%r15
+       rorq    $28,%r14
+       addq    %r13,%r12
+       xorq    %r15,%rax
+       addq    %r12,%r8
+       addq    %r12,%rax
+       leaq    24(%rbp),%rbp
+       cmpb    $0,7(%rbp)
+       jnz     .Lrounds_16_xx
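+       /*
+        * Block done: reload the context pointer saved at 128(%rsp), fold
+        * the working registers back into the eight state words, advance
+        * %rsi by 128 bytes and loop until the saved end pointer is hit.
+        */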
+       movq    128+0(%rsp),%rdi
+       addq    %r14,%rax
+       leaq    128(%rsi),%rsi
+       addq    0(%rdi),%rax
+       addq    8(%rdi),%rbx
+       addq    16(%rdi),%rcx
+       addq    24(%rdi),%rdx
+       addq    32(%rdi),%r8
+       addq    40(%rdi),%r9
+       addq    48(%rdi),%r10
+       addq    56(%rdi),%r11
+       cmpq    128+16(%rsp),%rsi
+       movq    %rax,0(%rdi)
+       movq    %rbx,8(%rdi)
+       movq    %rcx,16(%rdi)
+       movq    %rdx,24(%rdi)
+       movq    %r8,32(%rdi)
+       movq    %r9,40(%rdi)
+       movq    %r10,48(%rdi)
+       movq    %r11,56(%rdi)
+       jb      .Lloop
+       movq    152(%rsp),%rsi
+.cfi_def_cfa   %rsi,8
+       movq    -48(%rsi),%r15
+.cfi_restore   %r15
+       movq    -40(%rsi),%r14
+.cfi_restore   %r14
+       movq    -32(%rsi),%r13
+.cfi_restore   %r13
+       movq    -24(%rsi),%r12
+.cfi_restore   %r12
+       movq    -16(%rsi),%rbp
+.cfi_restore   %rbp
+       movq    -8(%rsi),%rbx
+.cfi_restore   %rbx
+       leaq    (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
+.Lepilogue:
+       RET
+.cfi_endproc
+SET_SIZE(zfs_sha512_transform_x64)
+
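+/*
+ * AVX variant: the message schedule is expanded two 64-bit words at a
+ * time with 128-bit vector ops (and the round constants pre-added on the
+ * stack), while the round function itself stays scalar as above.
+ */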
+ENTRY_ALIGN(zfs_sha512_transform_avx, 64)
+.cfi_startproc
+       ENDBR
+       movq    %rsp,%rax
+.cfi_def_cfa_register  %rax
+       pushq   %rbx
+.cfi_offset    %rbx,-16
+       pushq   %rbp
+.cfi_offset    %rbp,-24
+       pushq   %r12
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_offset    %r14,-48
+       pushq   %r15
+.cfi_offset    %r15,-56
+       shlq    $4,%rdx
+       subq    $160,%rsp
+       leaq    (%rsi,%rdx,8),%rdx
+       andq    $-64,%rsp
+       movq    %rdi,128+0(%rsp)
+       movq    %rsi,128+8(%rsp)
+       movq    %rdx,128+16(%rsp)
+       movq    %rax,152(%rsp)
+.cfi_escape    0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+.Lprologue_avx:
+
+       vzeroupper
+       movq    0(%rdi),%rax
+       movq    8(%rdi),%rbx
+       movq    16(%rdi),%rcx
+       movq    24(%rdi),%rdx
+       movq    32(%rdi),%r8
+       movq    40(%rdi),%r9
+       movq    48(%rdi),%r10
+       movq    56(%rdi),%r11
+       jmp     .Lloop_avx
+.align 16
+.Lloop_avx:
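+       /*
+        * Per-block setup: load the 128-byte block, byte-swap it with
+        * vpshufb (the shuffle mask is presumably stored right after the
+        * K512 constants), pre-add the round constants and stage
+        * W[0..15]+K on the stack.
+        */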
+       vmovdqa K512+1280(%rip),%xmm11
+       vmovdqu 0(%rsi),%xmm0
+       leaq    K512+128(%rip),%rbp
+       vmovdqu 16(%rsi),%xmm1
+       vmovdqu 32(%rsi),%xmm2
+       vpshufb %xmm11,%xmm0,%xmm0
+       vmovdqu 48(%rsi),%xmm3
+       vpshufb %xmm11,%xmm1,%xmm1
+       vmovdqu 64(%rsi),%xmm4
+       vpshufb %xmm11,%xmm2,%xmm2
+       vmovdqu 80(%rsi),%xmm5
+       vpshufb %xmm11,%xmm3,%xmm3
+       vmovdqu 96(%rsi),%xmm6
+       vpshufb %xmm11,%xmm4,%xmm4
+       vmovdqu 112(%rsi),%xmm7
+       vpshufb %xmm11,%xmm5,%xmm5
+       vpaddq  -128(%rbp),%xmm0,%xmm8
+       vpshufb %xmm11,%xmm6,%xmm6
+       vpaddq  -96(%rbp),%xmm1,%xmm9
+       vpshufb %xmm11,%xmm7,%xmm7
+       vpaddq  -64(%rbp),%xmm2,%xmm10
+       vpaddq  -32(%rbp),%xmm3,%xmm11
+       vmovdqa %xmm8,0(%rsp)
+       vpaddq  0(%rbp),%xmm4,%xmm8
+       vmovdqa %xmm9,16(%rsp)
+       vpaddq  32(%rbp),%xmm5,%xmm9
+       vmovdqa %xmm10,32(%rsp)
+       vpaddq  64(%rbp),%xmm6,%xmm10
+       vmovdqa %xmm11,48(%rsp)
+       vpaddq  96(%rbp),%xmm7,%xmm11
+       vmovdqa %xmm8,64(%rsp)
+       movq    %rax,%r14
+       vmovdqa %xmm9,80(%rsp)
+       movq    %rbx,%rdi
+       vmovdqa %xmm10,96(%rsp)
+       xorq    %rcx,%rdi
+       vmovdqa %xmm11,112(%rsp)
+       movq    %r8,%r13
+       jmp     .Lavx_00_47
+
+.align 16
+.Lavx_00_47:
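+       /*
+        * Main loop: each pass interleaves vector expansion of the next
+        * sixteen schedule words (the vpalignr/vpsrlq/vpsllq sequences)
+        * with sixteen scalar rounds reading W+K from the stack; the cmpb
+        * below exits once the constant table is exhausted.
+        */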
+       addq    $256,%rbp
+       vpalignr        $8,%xmm0,%xmm1,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rax
+       vpalignr        $8,%xmm4,%xmm5,%xmm11
+       movq    %r9,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       vpaddq  %xmm11,%xmm0,%xmm0
+       shrdq   $4,%r13,%r13
+       xorq    %rax,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    0(%rsp),%r11
+       movq    %rax,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %r10,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rbx,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm7,%xmm11
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %rdx,%r13
+       addq    %r11,%r14
+       vpsllq  $3,%xmm7,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r11
+       vpaddq  %xmm8,%xmm0,%xmm0
+       movq    %r8,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm7,%xmm9
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %r11,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    8(%rsp),%r10
+       movq    %r11,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %r9,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm0,%xmm0
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       vpaddq  -128(%rbp),%xmm0,%xmm10
+       xorq    %rax,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       vmovdqa %xmm10,0(%rsp)
+       vpalignr        $8,%xmm1,%xmm2,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r10
+       vpalignr        $8,%xmm5,%xmm6,%xmm11
+       movq    %rdx,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       vpaddq  %xmm11,%xmm1,%xmm1
+       shrdq   $4,%r13,%r13
+       xorq    %r10,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    16(%rsp),%r9
+       movq    %r10,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %r8,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r11,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm0,%xmm11
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %rbx,%r13
+       addq    %r9,%r14
+       vpsllq  $3,%xmm0,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r9
+       vpaddq  %xmm8,%xmm1,%xmm1
+       movq    %rcx,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm0,%xmm9
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %r9,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    24(%rsp),%r8
+       movq    %r9,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %rdx,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm1,%xmm1
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       vpaddq  -96(%rbp),%xmm1,%xmm10
+       xorq    %r10,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       vmovdqa %xmm10,16(%rsp)
+       vpalignr        $8,%xmm2,%xmm3,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r8
+       vpalignr        $8,%xmm6,%xmm7,%xmm11
+       movq    %rbx,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       vpaddq  %xmm11,%xmm2,%xmm2
+       shrdq   $4,%r13,%r13
+       xorq    %r8,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    32(%rsp),%rdx
+       movq    %r8,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %rcx,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r9,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm1,%xmm11
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %r11,%r13
+       addq    %rdx,%r14
+       vpsllq  $3,%xmm1,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rdx
+       vpaddq  %xmm8,%xmm2,%xmm2
+       movq    %rax,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm1,%xmm9
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %rdx,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    40(%rsp),%rcx
+       movq    %rdx,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %rbx,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm2,%xmm2
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       vpaddq  -64(%rbp),%xmm2,%xmm10
+       xorq    %r8,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       vmovdqa %xmm10,32(%rsp)
+       vpalignr        $8,%xmm3,%xmm4,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rcx
+       vpalignr        $8,%xmm7,%xmm0,%xmm11
+       movq    %r11,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       vpaddq  %xmm11,%xmm3,%xmm3
+       shrdq   $4,%r13,%r13
+       xorq    %rcx,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    48(%rsp),%rbx
+       movq    %rcx,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %rax,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rdx,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm2,%xmm11
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %r9,%r13
+       addq    %rbx,%r14
+       vpsllq  $3,%xmm2,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rbx
+       vpaddq  %xmm8,%xmm3,%xmm3
+       movq    %r10,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm2,%xmm9
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %rbx,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    56(%rsp),%rax
+       movq    %rbx,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %r11,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm3,%xmm3
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       vpaddq  -32(%rbp),%xmm3,%xmm10
+       xorq    %rcx,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       vmovdqa %xmm10,48(%rsp)
+       vpalignr        $8,%xmm4,%xmm5,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rax
+       vpalignr        $8,%xmm0,%xmm1,%xmm11
+       movq    %r9,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       vpaddq  %xmm11,%xmm4,%xmm4
+       shrdq   $4,%r13,%r13
+       xorq    %rax,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    64(%rsp),%r11
+       movq    %rax,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %r10,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rbx,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm3,%xmm11
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %rdx,%r13
+       addq    %r11,%r14
+       vpsllq  $3,%xmm3,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r11
+       vpaddq  %xmm8,%xmm4,%xmm4
+       movq    %r8,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm3,%xmm9
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %r11,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    72(%rsp),%r10
+       movq    %r11,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %r9,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm4,%xmm4
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       vpaddq  0(%rbp),%xmm4,%xmm10
+       xorq    %rax,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       vmovdqa %xmm10,64(%rsp)
+       vpalignr        $8,%xmm5,%xmm6,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r10
+       vpalignr        $8,%xmm1,%xmm2,%xmm11
+       movq    %rdx,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       vpaddq  %xmm11,%xmm5,%xmm5
+       shrdq   $4,%r13,%r13
+       xorq    %r10,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    80(%rsp),%r9
+       movq    %r10,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %r8,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r11,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm4,%xmm11
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %rbx,%r13
+       addq    %r9,%r14
+       vpsllq  $3,%xmm4,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r9
+       vpaddq  %xmm8,%xmm5,%xmm5
+       movq    %rcx,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm4,%xmm9
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %r9,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    88(%rsp),%r8
+       movq    %r9,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %rdx,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm5,%xmm5
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       vpaddq  32(%rbp),%xmm5,%xmm10
+       xorq    %r10,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       vmovdqa %xmm10,80(%rsp)
+       vpalignr        $8,%xmm6,%xmm7,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r8
+       vpalignr        $8,%xmm2,%xmm3,%xmm11
+       movq    %rbx,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       vpaddq  %xmm11,%xmm6,%xmm6
+       shrdq   $4,%r13,%r13
+       xorq    %r8,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    96(%rsp),%rdx
+       movq    %r8,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %rcx,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %r9,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm5,%xmm11
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %r11,%r13
+       addq    %rdx,%r14
+       vpsllq  $3,%xmm5,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rdx
+       vpaddq  %xmm8,%xmm6,%xmm6
+       movq    %rax,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm5,%xmm9
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %rdx,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    104(%rsp),%rcx
+       movq    %rdx,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %rbx,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm6,%xmm6
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       vpaddq  64(%rbp),%xmm6,%xmm10
+       xorq    %r8,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       vmovdqa %xmm10,96(%rsp)
+       vpalignr        $8,%xmm7,%xmm0,%xmm8
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rcx
+       vpalignr        $8,%xmm3,%xmm4,%xmm11
+       movq    %r11,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $1,%xmm8,%xmm10
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       vpaddq  %xmm11,%xmm7,%xmm7
+       shrdq   $4,%r13,%r13
+       xorq    %rcx,%r14
+       vpsrlq  $7,%xmm8,%xmm11
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       vpsllq  $56,%xmm8,%xmm9
+       addq    112(%rsp),%rbx
+       movq    %rcx,%r15
+       vpxor   %xmm10,%xmm11,%xmm8
+       xorq    %rax,%r12
+       shrdq   $6,%r14,%r14
+       vpsrlq  $7,%xmm10,%xmm10
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       vpxor   %xmm9,%xmm8,%xmm8
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       vpsllq  $7,%xmm9,%xmm9
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       vpxor   %xmm10,%xmm8,%xmm8
+       xorq    %rdx,%rdi
+       shrdq   $28,%r14,%r14
+       vpsrlq  $6,%xmm6,%xmm11
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       vpxor   %xmm9,%xmm8,%xmm8
+       movq    %r9,%r13
+       addq    %rbx,%r14
+       vpsllq  $3,%xmm6,%xmm10
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rbx
+       vpaddq  %xmm8,%xmm7,%xmm7
+       movq    %r10,%r12
+       shrdq   $5,%r14,%r14
+       vpsrlq  $19,%xmm6,%xmm9
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       vpxor   %xmm10,%xmm11,%xmm11
+       shrdq   $4,%r13,%r13
+       xorq    %rbx,%r14
+       vpsllq  $42,%xmm10,%xmm10
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       vpxor   %xmm9,%xmm11,%xmm11
+       addq    120(%rsp),%rax
+       movq    %rbx,%rdi
+       vpsrlq  $42,%xmm9,%xmm9
+       xorq    %r11,%r12
+       shrdq   $6,%r14,%r14
+       vpxor   %xmm10,%xmm11,%xmm11
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       vpxor   %xmm9,%xmm11,%xmm11
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       vpaddq  %xmm11,%xmm7,%xmm7
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       vpaddq  96(%rbp),%xmm7,%xmm10
+       xorq    %rcx,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       vmovdqa %xmm10,112(%rsp)
+       cmpb    $0,135(%rbp)
+       jne     .Lavx_00_47
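+       /*
+        * Last sixteen rounds: the final W+K values are already staged on
+        * the stack, so no further schedule expansion is needed.
+        */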
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rax
+       movq    %r9,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rax,%r14
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       addq    0(%rsp),%r11
+       movq    %rax,%r15
+       xorq    %r10,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       xorq    %rbx,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       movq    %rdx,%r13
+       addq    %r11,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r11
+       movq    %r8,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r11,%r14
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       addq    8(%rsp),%r10
+       movq    %r11,%rdi
+       xorq    %r9,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       xorq    %rax,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r10
+       movq    %rdx,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r10,%r14
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       addq    16(%rsp),%r9
+       movq    %r10,%r15
+       xorq    %r8,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       xorq    %r11,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       movq    %rbx,%r13
+       addq    %r9,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r9
+       movq    %rcx,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r9,%r14
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       addq    24(%rsp),%r8
+       movq    %r9,%rdi
+       xorq    %rdx,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       xorq    %r10,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r8
+       movq    %rbx,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r8,%r14
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       addq    32(%rsp),%rdx
+       movq    %r8,%r15
+       xorq    %rcx,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       xorq    %r9,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       movq    %r11,%r13
+       addq    %rdx,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rdx
+       movq    %rax,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rdx,%r14
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       addq    40(%rsp),%rcx
+       movq    %rdx,%rdi
+       xorq    %rbx,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       xorq    %r8,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rcx
+       movq    %r11,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rcx,%r14
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       addq    48(%rsp),%rbx
+       movq    %rcx,%r15
+       xorq    %rax,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       xorq    %rdx,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       movq    %r9,%r13
+       addq    %rbx,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rbx
+       movq    %r10,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rbx,%r14
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       addq    56(%rsp),%rax
+       movq    %rbx,%rdi
+       xorq    %r11,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       xorq    %rcx,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rax
+       movq    %r9,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r8,%r13
+       xorq    %r10,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rax,%r14
+       andq    %r8,%r12
+       xorq    %r8,%r13
+       addq    64(%rsp),%r11
+       movq    %rax,%r15
+       xorq    %r10,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rbx,%r15
+       addq    %r12,%r11
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %rax,%r14
+       addq    %r13,%r11
+       xorq    %rbx,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %r11,%rdx
+       addq    %rdi,%r11
+       movq    %rdx,%r13
+       addq    %r11,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r11
+       movq    %r8,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rdx,%r13
+       xorq    %r9,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r11,%r14
+       andq    %rdx,%r12
+       xorq    %rdx,%r13
+       addq    72(%rsp),%r10
+       movq    %r11,%rdi
+       xorq    %r9,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rax,%rdi
+       addq    %r12,%r10
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %r11,%r14
+       addq    %r13,%r10
+       xorq    %rax,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r10,%rcx
+       addq    %r15,%r10
+       movq    %rcx,%r13
+       addq    %r10,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r10
+       movq    %rdx,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rcx,%r13
+       xorq    %r8,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r10,%r14
+       andq    %rcx,%r12
+       xorq    %rcx,%r13
+       addq    80(%rsp),%r9
+       movq    %r10,%r15
+       xorq    %r8,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r11,%r15
+       addq    %r12,%r9
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %r10,%r14
+       addq    %r13,%r9
+       xorq    %r11,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %r9,%rbx
+       addq    %rdi,%r9
+       movq    %rbx,%r13
+       addq    %r9,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r9
+       movq    %rcx,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rbx,%r13
+       xorq    %rdx,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r9,%r14
+       andq    %rbx,%r12
+       xorq    %rbx,%r13
+       addq    88(%rsp),%r8
+       movq    %r9,%rdi
+       xorq    %rdx,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r10,%rdi
+       addq    %r12,%r8
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %r9,%r14
+       addq    %r13,%r8
+       xorq    %r10,%r15
+       shrdq   $28,%r14,%r14
+       addq    %r8,%rax
+       addq    %r15,%r8
+       movq    %rax,%r13
+       addq    %r8,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%r8
+       movq    %rbx,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %rax,%r13
+       xorq    %rcx,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %r8,%r14
+       andq    %rax,%r12
+       xorq    %rax,%r13
+       addq    96(%rsp),%rdx
+       movq    %r8,%r15
+       xorq    %rcx,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r9,%r15
+       addq    %r12,%rdx
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %r8,%r14
+       addq    %r13,%rdx
+       xorq    %r9,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %rdx,%r11
+       addq    %rdi,%rdx
+       movq    %r11,%r13
+       addq    %rdx,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rdx
+       movq    %rax,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r11,%r13
+       xorq    %rbx,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rdx,%r14
+       andq    %r11,%r12
+       xorq    %r11,%r13
+       addq    104(%rsp),%rcx
+       movq    %rdx,%rdi
+       xorq    %rbx,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %r8,%rdi
+       addq    %r12,%rcx
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %rdx,%r14
+       addq    %r13,%rcx
+       xorq    %r8,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rcx,%r10
+       addq    %r15,%rcx
+       movq    %r10,%r13
+       addq    %rcx,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rcx
+       movq    %r11,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r10,%r13
+       xorq    %rax,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rcx,%r14
+       andq    %r10,%r12
+       xorq    %r10,%r13
+       addq    112(%rsp),%rbx
+       movq    %rcx,%r15
+       xorq    %rax,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rdx,%r15
+       addq    %r12,%rbx
+       shrdq   $14,%r13,%r13
+       andq    %r15,%rdi
+       xorq    %rcx,%r14
+       addq    %r13,%rbx
+       xorq    %rdx,%rdi
+       shrdq   $28,%r14,%r14
+       addq    %rbx,%r9
+       addq    %rdi,%rbx
+       movq    %r9,%r13
+       addq    %rbx,%r14
+       shrdq   $23,%r13,%r13
+       movq    %r14,%rbx
+       movq    %r10,%r12
+       shrdq   $5,%r14,%r14
+       xorq    %r9,%r13
+       xorq    %r11,%r12
+       shrdq   $4,%r13,%r13
+       xorq    %rbx,%r14
+       andq    %r9,%r12
+       xorq    %r9,%r13
+       addq    120(%rsp),%rax
+       movq    %rbx,%rdi
+       xorq    %r11,%r12
+       shrdq   $6,%r14,%r14
+       xorq    %rcx,%rdi
+       addq    %r12,%rax
+       shrdq   $14,%r13,%r13
+       andq    %rdi,%r15
+       xorq    %rbx,%r14
+       addq    %r13,%rax
+       xorq    %rcx,%r15
+       shrdq   $28,%r14,%r14
+       addq    %rax,%r8
+       addq    %r15,%rax
+       movq    %r8,%r13
+       addq    %rax,%r14
+       movq    128+0(%rsp),%rdi
+       movq    %r14,%rax
+
+       addq    0(%rdi),%rax
+       leaq    128(%rsi),%rsi
+       addq    8(%rdi),%rbx
+       addq    16(%rdi),%rcx
+       addq    24(%rdi),%rdx
+       addq    32(%rdi),%r8
+       addq    40(%rdi),%r9
+       addq    48(%rdi),%r10
+       addq    56(%rdi),%r11
+
+       cmpq    128+16(%rsp),%rsi
+
+       movq    %rax,0(%rdi)
+       movq    %rbx,8(%rdi)
+       movq    %rcx,16(%rdi)
+       movq    %rdx,24(%rdi)
+       movq    %r8,32(%rdi)
+       movq    %r9,40(%rdi)
+       movq    %r10,48(%rdi)
+       movq    %r11,56(%rdi)
+       jb      .Lloop_avx
+
+       movq    152(%rsp),%rsi
+.cfi_def_cfa   %rsi,8
+       vzeroupper
+       movq    -48(%rsi),%r15
+.cfi_restore   %r15
+       movq    -40(%rsi),%r14
+.cfi_restore   %r14
+       movq    -32(%rsi),%r13
+.cfi_restore   %r13
+       movq    -24(%rsi),%r12
+.cfi_restore   %r12
+       movq    -16(%rsi),%rbp
+.cfi_restore   %rbp
+       movq    -8(%rsi),%rbx
+.cfi_restore   %rbx
+       leaq    (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
+.Lepilogue_avx:
+       RET
+.cfi_endproc
+SET_SIZE(zfs_sha512_transform_avx)
+
+ENTRY_ALIGN(zfs_sha512_transform_avx2, 64)
+.cfi_startproc
+       ENDBR
+       movq    %rsp,%rax
+.cfi_def_cfa_register  %rax
+       pushq   %rbx
+.cfi_offset    %rbx,-16
+       pushq   %rbp
+.cfi_offset    %rbp,-24
+       pushq   %r12
+.cfi_offset    %r12,-32
+       pushq   %r13
+.cfi_offset    %r13,-40
+       pushq   %r14
+.cfi_offset    %r14,-48
+       pushq   %r15
+.cfi_offset    %r15,-56
+       subq    $1312,%rsp
+       shlq    $4,%rdx
+       andq    $-2048,%rsp
+       leaq    (%rsi,%rdx,8),%rdx
+       addq    $1152,%rsp
+       movq    %rdi,128+0(%rsp)
+       movq    %rsi,128+8(%rsp)
+       movq    %rdx,128+16(%rsp)
+       movq    %rax,152(%rsp)
+.cfi_escape    0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+.Lprologue_avx2:
+
+       vzeroupper
+       subq    $-128,%rsi
+       movq    0(%rdi),%rax
+       movq    %rsi,%r12
+       movq    8(%rdi),%rbx
+       cmpq    %rdx,%rsi
+       movq    16(%rdi),%rcx
+       cmoveq  %rsp,%r12
+       movq    24(%rdi),%rdx
+       movq    32(%rdi),%r8
+       movq    40(%rdi),%r9
+       movq    48(%rdi),%r10
+       movq    56(%rdi),%r11
+       jmp     .Loop_avx2
+.align 16
+.Loop_avx2:
+       vmovdqu -128(%rsi),%xmm0
+       vmovdqu -128+16(%rsi),%xmm1
+       vmovdqu -128+32(%rsi),%xmm2
+       leaq    K512+128(%rip),%rbp
+       vmovdqu -128+48(%rsi),%xmm3
+       vmovdqu -128+64(%rsi),%xmm4
+       vmovdqu -128+80(%rsi),%xmm5
+       vmovdqu -128+96(%rsi),%xmm6
+       vmovdqu -128+112(%rsi),%xmm7
+
+       vmovdqa 1152(%rbp),%ymm10
+       vinserti128     $1,(%r12),%ymm0,%ymm0
+       vinserti128     $1,16(%r12),%ymm1,%ymm1
+       vpshufb %ymm10,%ymm0,%ymm0
+       vinserti128     $1,32(%r12),%ymm2,%ymm2
+       vpshufb %ymm10,%ymm1,%ymm1
+       vinserti128     $1,48(%r12),%ymm3,%ymm3
+       vpshufb %ymm10,%ymm2,%ymm2
+       vinserti128     $1,64(%r12),%ymm4,%ymm4
+       vpshufb %ymm10,%ymm3,%ymm3
+       vinserti128     $1,80(%r12),%ymm5,%ymm5
+       vpshufb %ymm10,%ymm4,%ymm4
+       vinserti128     $1,96(%r12),%ymm6,%ymm6
+       vpshufb %ymm10,%ymm5,%ymm5
+       vinserti128     $1,112(%r12),%ymm7,%ymm7
+
+       vpaddq  -128(%rbp),%ymm0,%ymm8
+       vpshufb %ymm10,%ymm6,%ymm6
+       vpaddq  -96(%rbp),%ymm1,%ymm9
+       vpshufb %ymm10,%ymm7,%ymm7
+       vpaddq  -64(%rbp),%ymm2,%ymm10
+       vpaddq  -32(%rbp),%ymm3,%ymm11
+       vmovdqa %ymm8,0(%rsp)
+       vpaddq  0(%rbp),%ymm4,%ymm8
+       vmovdqa %ymm9,32(%rsp)
+       vpaddq  32(%rbp),%ymm5,%ymm9
+       vmovdqa %ymm10,64(%rsp)
+       vpaddq  64(%rbp),%ymm6,%ymm10
+       vmovdqa %ymm11,96(%rsp)
+
+       movq    152(%rsp),%rdi
+.cfi_def_cfa   %rdi,8
+       leaq    -128(%rsp),%rsp
+
+
+
+       movq    %rdi,-8(%rsp)
+.cfi_escape    0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+       vpaddq  96(%rbp),%ymm7,%ymm11
+       vmovdqa %ymm8,0(%rsp)
+       xorq    %r14,%r14
+       vmovdqa %ymm9,32(%rsp)
+       movq    %rbx,%rdi
+       vmovdqa %ymm10,64(%rsp)
+       xorq    %rcx,%rdi
+       vmovdqa %ymm11,96(%rsp)
+       movq    %r9,%r12
+       addq    $32*8,%rbp
+       jmp     .Lavx2_00_47
+
+.align 16
+.Lavx2_00_47:
+       leaq    -128(%rsp),%rsp
+.cfi_escape    0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08
+
+       pushq   128-8(%rsp)
+.cfi_escape    0x0f,0x05,0x77,0x00,0x06,0x23,0x08
+       leaq    8(%rsp),%rsp
+.cfi_escape    0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+       vpalignr        $8,%ymm0,%ymm1,%ymm8
+       addq    0+256(%rsp),%r11
+       andq    %r8,%r12
+       rorxq   $41,%r8,%r13
+       vpalignr        $8,%ymm4,%ymm5,%ymm11
+       rorxq   $18,%r8,%r15
+       leaq    (%rax,%r14,1),%rax
+       leaq    (%r11,%r12,1),%r11
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %r10,%r8,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r8,%r14
+       vpaddq  %ymm11,%ymm0,%ymm0
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%r11,%r12,1),%r11
+       xorq    %r14,%r13
+       movq    %rax,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%rax,%r12
+       leaq    (%r11,%r13,1),%r11
+       xorq    %rbx,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%rax,%r14
+       rorxq   $28,%rax,%r13
+       leaq    (%rdx,%r11,1),%rdx
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rbx,%rdi
+       vpsrlq  $6,%ymm7,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%r11,%rdi,1),%r11
+       movq    %r8,%r12
+       vpsllq  $3,%ymm7,%ymm10
+       vpaddq  %ymm8,%ymm0,%ymm0
+       addq    8+256(%rsp),%r10
+       andq    %rdx,%r12
+       rorxq   $41,%rdx,%r13
+       vpsrlq  $19,%ymm7,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%rdx,%rdi
+       leaq    (%r11,%r14,1),%r11
+       leaq    (%r10,%r12,1),%r10
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %r9,%rdx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rdx,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%r10,%r12,1),%r10
+       xorq    %r14,%r13
+       movq    %r11,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%r11,%r12
+       leaq    (%r10,%r13,1),%r10
+       xorq    %rax,%rdi
+       vpaddq  %ymm11,%ymm0,%ymm0
+       rorxq   $34,%r11,%r14
+       rorxq   $28,%r11,%r13
+       leaq    (%rcx,%r10,1),%rcx
+       vpaddq  -128(%rbp),%ymm0,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rax,%r15
+       xorq    %r13,%r14
+       leaq    (%r10,%r15,1),%r10
+       movq    %rdx,%r12
+       vmovdqa %ymm10,0(%rsp)
+       vpalignr        $8,%ymm1,%ymm2,%ymm8
+       addq    32+256(%rsp),%r9
+       andq    %rcx,%r12
+       rorxq   $41,%rcx,%r13
+       vpalignr        $8,%ymm5,%ymm6,%ymm11
+       rorxq   $18,%rcx,%r15
+       leaq    (%r10,%r14,1),%r10
+       leaq    (%r9,%r12,1),%r9
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %r8,%rcx,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rcx,%r14
+       vpaddq  %ymm11,%ymm1,%ymm1
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%r9,%r12,1),%r9
+       xorq    %r14,%r13
+       movq    %r10,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%r10,%r12
+       leaq    (%r9,%r13,1),%r9
+       xorq    %r11,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%r10,%r14
+       rorxq   $28,%r10,%r13
+       leaq    (%rbx,%r9,1),%rbx
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r11,%rdi
+       vpsrlq  $6,%ymm0,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%r9,%rdi,1),%r9
+       movq    %rcx,%r12
+       vpsllq  $3,%ymm0,%ymm10
+       vpaddq  %ymm8,%ymm1,%ymm1
+       addq    40+256(%rsp),%r8
+       andq    %rbx,%r12
+       rorxq   $41,%rbx,%r13
+       vpsrlq  $19,%ymm0,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%rbx,%rdi
+       leaq    (%r9,%r14,1),%r9
+       leaq    (%r8,%r12,1),%r8
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %rdx,%rbx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rbx,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%r8,%r12,1),%r8
+       xorq    %r14,%r13
+       movq    %r9,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%r9,%r12
+       leaq    (%r8,%r13,1),%r8
+       xorq    %r10,%rdi
+       vpaddq  %ymm11,%ymm1,%ymm1
+       rorxq   $34,%r9,%r14
+       rorxq   $28,%r9,%r13
+       leaq    (%rax,%r8,1),%rax
+       vpaddq  -96(%rbp),%ymm1,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r10,%r15
+       xorq    %r13,%r14
+       leaq    (%r8,%r15,1),%r8
+       movq    %rbx,%r12
+       vmovdqa %ymm10,32(%rsp)
+       vpalignr        $8,%ymm2,%ymm3,%ymm8
+       addq    64+256(%rsp),%rdx
+       andq    %rax,%r12
+       rorxq   $41,%rax,%r13
+       vpalignr        $8,%ymm6,%ymm7,%ymm11
+       rorxq   $18,%rax,%r15
+       leaq    (%r8,%r14,1),%r8
+       leaq    (%rdx,%r12,1),%rdx
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %rcx,%rax,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rax,%r14
+       vpaddq  %ymm11,%ymm2,%ymm2
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%rdx,%r12,1),%rdx
+       xorq    %r14,%r13
+       movq    %r8,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%r8,%r12
+       leaq    (%rdx,%r13,1),%rdx
+       xorq    %r9,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%r8,%r14
+       rorxq   $28,%r8,%r13
+       leaq    (%r11,%rdx,1),%r11
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r9,%rdi
+       vpsrlq  $6,%ymm1,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%rdx,%rdi,1),%rdx
+       movq    %rax,%r12
+       vpsllq  $3,%ymm1,%ymm10
+       vpaddq  %ymm8,%ymm2,%ymm2
+       addq    72+256(%rsp),%rcx
+       andq    %r11,%r12
+       rorxq   $41,%r11,%r13
+       vpsrlq  $19,%ymm1,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%r11,%rdi
+       leaq    (%rdx,%r14,1),%rdx
+       leaq    (%rcx,%r12,1),%rcx
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %rbx,%r11,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r11,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%rcx,%r12,1),%rcx
+       xorq    %r14,%r13
+       movq    %rdx,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%rdx,%r12
+       leaq    (%rcx,%r13,1),%rcx
+       xorq    %r8,%rdi
+       vpaddq  %ymm11,%ymm2,%ymm2
+       rorxq   $34,%rdx,%r14
+       rorxq   $28,%rdx,%r13
+       leaq    (%r10,%rcx,1),%r10
+       vpaddq  -64(%rbp),%ymm2,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r8,%r15
+       xorq    %r13,%r14
+       leaq    (%rcx,%r15,1),%rcx
+       movq    %r11,%r12
+       vmovdqa %ymm10,64(%rsp)
+       vpalignr        $8,%ymm3,%ymm4,%ymm8
+       addq    96+256(%rsp),%rbx
+       andq    %r10,%r12
+       rorxq   $41,%r10,%r13
+       vpalignr        $8,%ymm7,%ymm0,%ymm11
+       rorxq   $18,%r10,%r15
+       leaq    (%rcx,%r14,1),%rcx
+       leaq    (%rbx,%r12,1),%rbx
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %rax,%r10,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r10,%r14
+       vpaddq  %ymm11,%ymm3,%ymm3
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%rbx,%r12,1),%rbx
+       xorq    %r14,%r13
+       movq    %rcx,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%rcx,%r12
+       leaq    (%rbx,%r13,1),%rbx
+       xorq    %rdx,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%rcx,%r14
+       rorxq   $28,%rcx,%r13
+       leaq    (%r9,%rbx,1),%r9
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rdx,%rdi
+       vpsrlq  $6,%ymm2,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%rbx,%rdi,1),%rbx
+       movq    %r10,%r12
+       vpsllq  $3,%ymm2,%ymm10
+       vpaddq  %ymm8,%ymm3,%ymm3
+       addq    104+256(%rsp),%rax
+       andq    %r9,%r12
+       rorxq   $41,%r9,%r13
+       vpsrlq  $19,%ymm2,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%r9,%rdi
+       leaq    (%rbx,%r14,1),%rbx
+       leaq    (%rax,%r12,1),%rax
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %r11,%r9,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r9,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%rax,%r12,1),%rax
+       xorq    %r14,%r13
+       movq    %rbx,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%rbx,%r12
+       leaq    (%rax,%r13,1),%rax
+       xorq    %rcx,%rdi
+       vpaddq  %ymm11,%ymm3,%ymm3
+       rorxq   $34,%rbx,%r14
+       rorxq   $28,%rbx,%r13
+       leaq    (%r8,%rax,1),%r8
+       vpaddq  -32(%rbp),%ymm3,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rcx,%r15
+       xorq    %r13,%r14
+       leaq    (%rax,%r15,1),%rax
+       movq    %r9,%r12
+       vmovdqa %ymm10,96(%rsp)
+       leaq    -128(%rsp),%rsp
+.cfi_escape    0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08
+
+       pushq   128-8(%rsp)
+.cfi_escape    0x0f,0x05,0x77,0x00,0x06,0x23,0x08
+       leaq    8(%rsp),%rsp
+.cfi_escape    0x0f,0x05,0x77,0x78,0x06,0x23,0x08
+       vpalignr        $8,%ymm4,%ymm5,%ymm8
+       addq    0+256(%rsp),%r11
+       andq    %r8,%r12
+       rorxq   $41,%r8,%r13
+       vpalignr        $8,%ymm0,%ymm1,%ymm11
+       rorxq   $18,%r8,%r15
+       leaq    (%rax,%r14,1),%rax
+       leaq    (%r11,%r12,1),%r11
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %r10,%r8,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r8,%r14
+       vpaddq  %ymm11,%ymm4,%ymm4
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%r11,%r12,1),%r11
+       xorq    %r14,%r13
+       movq    %rax,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%rax,%r12
+       leaq    (%r11,%r13,1),%r11
+       xorq    %rbx,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%rax,%r14
+       rorxq   $28,%rax,%r13
+       leaq    (%rdx,%r11,1),%rdx
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rbx,%rdi
+       vpsrlq  $6,%ymm3,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%r11,%rdi,1),%r11
+       movq    %r8,%r12
+       vpsllq  $3,%ymm3,%ymm10
+       vpaddq  %ymm8,%ymm4,%ymm4
+       addq    8+256(%rsp),%r10
+       andq    %rdx,%r12
+       rorxq   $41,%rdx,%r13
+       vpsrlq  $19,%ymm3,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%rdx,%rdi
+       leaq    (%r11,%r14,1),%r11
+       leaq    (%r10,%r12,1),%r10
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %r9,%rdx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rdx,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%r10,%r12,1),%r10
+       xorq    %r14,%r13
+       movq    %r11,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%r11,%r12
+       leaq    (%r10,%r13,1),%r10
+       xorq    %rax,%rdi
+       vpaddq  %ymm11,%ymm4,%ymm4
+       rorxq   $34,%r11,%r14
+       rorxq   $28,%r11,%r13
+       leaq    (%rcx,%r10,1),%rcx
+       vpaddq  0(%rbp),%ymm4,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rax,%r15
+       xorq    %r13,%r14
+       leaq    (%r10,%r15,1),%r10
+       movq    %rdx,%r12
+       vmovdqa %ymm10,0(%rsp)
+       vpalignr        $8,%ymm5,%ymm6,%ymm8
+       addq    32+256(%rsp),%r9
+       andq    %rcx,%r12
+       rorxq   $41,%rcx,%r13
+       vpalignr        $8,%ymm1,%ymm2,%ymm11
+       rorxq   $18,%rcx,%r15
+       leaq    (%r10,%r14,1),%r10
+       leaq    (%r9,%r12,1),%r9
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %r8,%rcx,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rcx,%r14
+       vpaddq  %ymm11,%ymm5,%ymm5
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%r9,%r12,1),%r9
+       xorq    %r14,%r13
+       movq    %r10,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%r10,%r12
+       leaq    (%r9,%r13,1),%r9
+       xorq    %r11,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%r10,%r14
+       rorxq   $28,%r10,%r13
+       leaq    (%rbx,%r9,1),%rbx
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r11,%rdi
+       vpsrlq  $6,%ymm4,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%r9,%rdi,1),%r9
+       movq    %rcx,%r12
+       vpsllq  $3,%ymm4,%ymm10
+       vpaddq  %ymm8,%ymm5,%ymm5
+       addq    40+256(%rsp),%r8
+       andq    %rbx,%r12
+       rorxq   $41,%rbx,%r13
+       vpsrlq  $19,%ymm4,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%rbx,%rdi
+       leaq    (%r9,%r14,1),%r9
+       leaq    (%r8,%r12,1),%r8
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %rdx,%rbx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rbx,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%r8,%r12,1),%r8
+       xorq    %r14,%r13
+       movq    %r9,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%r9,%r12
+       leaq    (%r8,%r13,1),%r8
+       xorq    %r10,%rdi
+       vpaddq  %ymm11,%ymm5,%ymm5
+       rorxq   $34,%r9,%r14
+       rorxq   $28,%r9,%r13
+       leaq    (%rax,%r8,1),%rax
+       vpaddq  32(%rbp),%ymm5,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r10,%r15
+       xorq    %r13,%r14
+       leaq    (%r8,%r15,1),%r8
+       movq    %rbx,%r12
+       vmovdqa %ymm10,32(%rsp)
+       vpalignr        $8,%ymm6,%ymm7,%ymm8
+       addq    64+256(%rsp),%rdx
+       andq    %rax,%r12
+       rorxq   $41,%rax,%r13
+       vpalignr        $8,%ymm2,%ymm3,%ymm11
+       rorxq   $18,%rax,%r15
+       leaq    (%r8,%r14,1),%r8
+       leaq    (%rdx,%r12,1),%rdx
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %rcx,%rax,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rax,%r14
+       vpaddq  %ymm11,%ymm6,%ymm6
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%rdx,%r12,1),%rdx
+       xorq    %r14,%r13
+       movq    %r8,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%r8,%r12
+       leaq    (%rdx,%r13,1),%rdx
+       xorq    %r9,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%r8,%r14
+       rorxq   $28,%r8,%r13
+       leaq    (%r11,%rdx,1),%r11
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r9,%rdi
+       vpsrlq  $6,%ymm5,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%rdx,%rdi,1),%rdx
+       movq    %rax,%r12
+       vpsllq  $3,%ymm5,%ymm10
+       vpaddq  %ymm8,%ymm6,%ymm6
+       addq    72+256(%rsp),%rcx
+       andq    %r11,%r12
+       rorxq   $41,%r11,%r13
+       vpsrlq  $19,%ymm5,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%r11,%rdi
+       leaq    (%rdx,%r14,1),%rdx
+       leaq    (%rcx,%r12,1),%rcx
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %rbx,%r11,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r11,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%rcx,%r12,1),%rcx
+       xorq    %r14,%r13
+       movq    %rdx,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%rdx,%r12
+       leaq    (%rcx,%r13,1),%rcx
+       xorq    %r8,%rdi
+       vpaddq  %ymm11,%ymm6,%ymm6
+       rorxq   $34,%rdx,%r14
+       rorxq   $28,%rdx,%r13
+       leaq    (%r10,%rcx,1),%r10
+       vpaddq  64(%rbp),%ymm6,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r8,%r15
+       xorq    %r13,%r14
+       leaq    (%rcx,%r15,1),%rcx
+       movq    %r11,%r12
+       vmovdqa %ymm10,64(%rsp)
+       vpalignr        $8,%ymm7,%ymm0,%ymm8
+       addq    96+256(%rsp),%rbx
+       andq    %r10,%r12
+       rorxq   $41,%r10,%r13
+       vpalignr        $8,%ymm3,%ymm4,%ymm11
+       rorxq   $18,%r10,%r15
+       leaq    (%rcx,%r14,1),%rcx
+       leaq    (%rbx,%r12,1),%rbx
+       vpsrlq  $1,%ymm8,%ymm10
+       andnq   %rax,%r10,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r10,%r14
+       vpaddq  %ymm11,%ymm7,%ymm7
+       vpsrlq  $7,%ymm8,%ymm11
+       leaq    (%rbx,%r12,1),%rbx
+       xorq    %r14,%r13
+       movq    %rcx,%r15
+       vpsllq  $56,%ymm8,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm8
+       rorxq   $39,%rcx,%r12
+       leaq    (%rbx,%r13,1),%rbx
+       xorq    %rdx,%r15
+       vpsrlq  $7,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm8,%ymm8
+       rorxq   $34,%rcx,%r14
+       rorxq   $28,%rcx,%r13
+       leaq    (%r9,%rbx,1),%r9
+       vpsllq  $7,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm8,%ymm8
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rdx,%rdi
+       vpsrlq  $6,%ymm6,%ymm11
+       vpxor   %ymm9,%ymm8,%ymm8
+       xorq    %r13,%r14
+       leaq    (%rbx,%rdi,1),%rbx
+       movq    %r10,%r12
+       vpsllq  $3,%ymm6,%ymm10
+       vpaddq  %ymm8,%ymm7,%ymm7
+       addq    104+256(%rsp),%rax
+       andq    %r9,%r12
+       rorxq   $41,%r9,%r13
+       vpsrlq  $19,%ymm6,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       rorxq   $18,%r9,%rdi
+       leaq    (%rbx,%r14,1),%rbx
+       leaq    (%rax,%r12,1),%rax
+       vpsllq  $42,%ymm10,%ymm10
+       vpxor   %ymm9,%ymm11,%ymm11
+       andnq   %r11,%r9,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r9,%r14
+       vpsrlq  $42,%ymm9,%ymm9
+       vpxor   %ymm10,%ymm11,%ymm11
+       leaq    (%rax,%r12,1),%rax
+       xorq    %r14,%r13
+       movq    %rbx,%rdi
+       vpxor   %ymm9,%ymm11,%ymm11
+       rorxq   $39,%rbx,%r12
+       leaq    (%rax,%r13,1),%rax
+       xorq    %rcx,%rdi
+       vpaddq  %ymm11,%ymm7,%ymm7
+       rorxq   $34,%rbx,%r14
+       rorxq   $28,%rbx,%r13
+       leaq    (%r8,%rax,1),%r8
+       vpaddq  96(%rbp),%ymm7,%ymm10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rcx,%r15
+       xorq    %r13,%r14
+       leaq    (%rax,%r15,1),%rax
+       movq    %r9,%r12
+       vmovdqa %ymm10,96(%rsp)
+       leaq    256(%rbp),%rbp
+       cmpb    $0,-121(%rbp)
+       jne     .Lavx2_00_47
+       addq    0+128(%rsp),%r11
+       andq    %r8,%r12
+       rorxq   $41,%r8,%r13
+       rorxq   $18,%r8,%r15
+       leaq    (%rax,%r14,1),%rax
+       leaq    (%r11,%r12,1),%r11
+       andnq   %r10,%r8,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r8,%r14
+       leaq    (%r11,%r12,1),%r11
+       xorq    %r14,%r13
+       movq    %rax,%r15
+       rorxq   $39,%rax,%r12
+       leaq    (%r11,%r13,1),%r11
+       xorq    %rbx,%r15
+       rorxq   $34,%rax,%r14
+       rorxq   $28,%rax,%r13
+       leaq    (%rdx,%r11,1),%rdx
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rbx,%rdi
+       xorq    %r13,%r14
+       leaq    (%r11,%rdi,1),%r11
+       movq    %r8,%r12
+       addq    8+128(%rsp),%r10
+       andq    %rdx,%r12
+       rorxq   $41,%rdx,%r13
+       rorxq   $18,%rdx,%rdi
+       leaq    (%r11,%r14,1),%r11
+       leaq    (%r10,%r12,1),%r10
+       andnq   %r9,%rdx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rdx,%r14
+       leaq    (%r10,%r12,1),%r10
+       xorq    %r14,%r13
+       movq    %r11,%rdi
+       rorxq   $39,%r11,%r12
+       leaq    (%r10,%r13,1),%r10
+       xorq    %rax,%rdi
+       rorxq   $34,%r11,%r14
+       rorxq   $28,%r11,%r13
+       leaq    (%rcx,%r10,1),%rcx
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rax,%r15
+       xorq    %r13,%r14
+       leaq    (%r10,%r15,1),%r10
+       movq    %rdx,%r12
+       addq    32+128(%rsp),%r9
+       andq    %rcx,%r12
+       rorxq   $41,%rcx,%r13
+       rorxq   $18,%rcx,%r15
+       leaq    (%r10,%r14,1),%r10
+       leaq    (%r9,%r12,1),%r9
+       andnq   %r8,%rcx,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rcx,%r14
+       leaq    (%r9,%r12,1),%r9
+       xorq    %r14,%r13
+       movq    %r10,%r15
+       rorxq   $39,%r10,%r12
+       leaq    (%r9,%r13,1),%r9
+       xorq    %r11,%r15
+       rorxq   $34,%r10,%r14
+       rorxq   $28,%r10,%r13
+       leaq    (%rbx,%r9,1),%rbx
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r11,%rdi
+       xorq    %r13,%r14
+       leaq    (%r9,%rdi,1),%r9
+       movq    %rcx,%r12
+       addq    40+128(%rsp),%r8
+       andq    %rbx,%r12
+       rorxq   $41,%rbx,%r13
+       rorxq   $18,%rbx,%rdi
+       leaq    (%r9,%r14,1),%r9
+       leaq    (%r8,%r12,1),%r8
+       andnq   %rdx,%rbx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rbx,%r14
+       leaq    (%r8,%r12,1),%r8
+       xorq    %r14,%r13
+       movq    %r9,%rdi
+       rorxq   $39,%r9,%r12
+       leaq    (%r8,%r13,1),%r8
+       xorq    %r10,%rdi
+       rorxq   $34,%r9,%r14
+       rorxq   $28,%r9,%r13
+       leaq    (%rax,%r8,1),%rax
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r10,%r15
+       xorq    %r13,%r14
+       leaq    (%r8,%r15,1),%r8
+       movq    %rbx,%r12
+       addq    64+128(%rsp),%rdx
+       andq    %rax,%r12
+       rorxq   $41,%rax,%r13
+       rorxq   $18,%rax,%r15
+       leaq    (%r8,%r14,1),%r8
+       leaq    (%rdx,%r12,1),%rdx
+       andnq   %rcx,%rax,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rax,%r14
+       leaq    (%rdx,%r12,1),%rdx
+       xorq    %r14,%r13
+       movq    %r8,%r15
+       rorxq   $39,%r8,%r12
+       leaq    (%rdx,%r13,1),%rdx
+       xorq    %r9,%r15
+       rorxq   $34,%r8,%r14
+       rorxq   $28,%r8,%r13
+       leaq    (%r11,%rdx,1),%r11
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r9,%rdi
+       xorq    %r13,%r14
+       leaq    (%rdx,%rdi,1),%rdx
+       movq    %rax,%r12
+       addq    72+128(%rsp),%rcx
+       andq    %r11,%r12
+       rorxq   $41,%r11,%r13
+       rorxq   $18,%r11,%rdi
+       leaq    (%rdx,%r14,1),%rdx
+       leaq    (%rcx,%r12,1),%rcx
+       andnq   %rbx,%r11,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r11,%r14
+       leaq    (%rcx,%r12,1),%rcx
+       xorq    %r14,%r13
+       movq    %rdx,%rdi
+       rorxq   $39,%rdx,%r12
+       leaq    (%rcx,%r13,1),%rcx
+       xorq    %r8,%rdi
+       rorxq   $34,%rdx,%r14
+       rorxq   $28,%rdx,%r13
+       leaq    (%r10,%rcx,1),%r10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r8,%r15
+       xorq    %r13,%r14
+       leaq    (%rcx,%r15,1),%rcx
+       movq    %r11,%r12
+       addq    96+128(%rsp),%rbx
+       andq    %r10,%r12
+       rorxq   $41,%r10,%r13
+       rorxq   $18,%r10,%r15
+       leaq    (%rcx,%r14,1),%rcx
+       leaq    (%rbx,%r12,1),%rbx
+       andnq   %rax,%r10,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r10,%r14
+       leaq    (%rbx,%r12,1),%rbx
+       xorq    %r14,%r13
+       movq    %rcx,%r15
+       rorxq   $39,%rcx,%r12
+       leaq    (%rbx,%r13,1),%rbx
+       xorq    %rdx,%r15
+       rorxq   $34,%rcx,%r14
+       rorxq   $28,%rcx,%r13
+       leaq    (%r9,%rbx,1),%r9
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rdx,%rdi
+       xorq    %r13,%r14
+       leaq    (%rbx,%rdi,1),%rbx
+       movq    %r10,%r12
+       addq    104+128(%rsp),%rax
+       andq    %r9,%r12
+       rorxq   $41,%r9,%r13
+       rorxq   $18,%r9,%rdi
+       leaq    (%rbx,%r14,1),%rbx
+       leaq    (%rax,%r12,1),%rax
+       andnq   %r11,%r9,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r9,%r14
+       leaq    (%rax,%r12,1),%rax
+       xorq    %r14,%r13
+       movq    %rbx,%rdi
+       rorxq   $39,%rbx,%r12
+       leaq    (%rax,%r13,1),%rax
+       xorq    %rcx,%rdi
+       rorxq   $34,%rbx,%r14
+       rorxq   $28,%rbx,%r13
+       leaq    (%r8,%rax,1),%r8
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rcx,%r15
+       xorq    %r13,%r14
+       leaq    (%rax,%r15,1),%rax
+       movq    %r9,%r12
+       addq    0(%rsp),%r11
+       andq    %r8,%r12
+       rorxq   $41,%r8,%r13
+       rorxq   $18,%r8,%r15
+       leaq    (%rax,%r14,1),%rax
+       leaq    (%r11,%r12,1),%r11
+       andnq   %r10,%r8,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r8,%r14
+       leaq    (%r11,%r12,1),%r11
+       xorq    %r14,%r13
+       movq    %rax,%r15
+       rorxq   $39,%rax,%r12
+       leaq    (%r11,%r13,1),%r11
+       xorq    %rbx,%r15
+       rorxq   $34,%rax,%r14
+       rorxq   $28,%rax,%r13
+       leaq    (%rdx,%r11,1),%rdx
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rbx,%rdi
+       xorq    %r13,%r14
+       leaq    (%r11,%rdi,1),%r11
+       movq    %r8,%r12
+       addq    8(%rsp),%r10
+       andq    %rdx,%r12
+       rorxq   $41,%rdx,%r13
+       rorxq   $18,%rdx,%rdi
+       leaq    (%r11,%r14,1),%r11
+       leaq    (%r10,%r12,1),%r10
+       andnq   %r9,%rdx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rdx,%r14
+       leaq    (%r10,%r12,1),%r10
+       xorq    %r14,%r13
+       movq    %r11,%rdi
+       rorxq   $39,%r11,%r12
+       leaq    (%r10,%r13,1),%r10
+       xorq    %rax,%rdi
+       rorxq   $34,%r11,%r14
+       rorxq   $28,%r11,%r13
+       leaq    (%rcx,%r10,1),%rcx
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rax,%r15
+       xorq    %r13,%r14
+       leaq    (%r10,%r15,1),%r10
+       movq    %rdx,%r12
+       addq    32(%rsp),%r9
+       andq    %rcx,%r12
+       rorxq   $41,%rcx,%r13
+       rorxq   $18,%rcx,%r15
+       leaq    (%r10,%r14,1),%r10
+       leaq    (%r9,%r12,1),%r9
+       andnq   %r8,%rcx,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rcx,%r14
+       leaq    (%r9,%r12,1),%r9
+       xorq    %r14,%r13
+       movq    %r10,%r15
+       rorxq   $39,%r10,%r12
+       leaq    (%r9,%r13,1),%r9
+       xorq    %r11,%r15
+       rorxq   $34,%r10,%r14
+       rorxq   $28,%r10,%r13
+       leaq    (%rbx,%r9,1),%rbx
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r11,%rdi
+       xorq    %r13,%r14
+       leaq    (%r9,%rdi,1),%r9
+       movq    %rcx,%r12
+       addq    40(%rsp),%r8
+       andq    %rbx,%r12
+       rorxq   $41,%rbx,%r13
+       rorxq   $18,%rbx,%rdi
+       leaq    (%r9,%r14,1),%r9
+       leaq    (%r8,%r12,1),%r8
+       andnq   %rdx,%rbx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rbx,%r14
+       leaq    (%r8,%r12,1),%r8
+       xorq    %r14,%r13
+       movq    %r9,%rdi
+       rorxq   $39,%r9,%r12
+       leaq    (%r8,%r13,1),%r8
+       xorq    %r10,%rdi
+       rorxq   $34,%r9,%r14
+       rorxq   $28,%r9,%r13
+       leaq    (%rax,%r8,1),%rax
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r10,%r15
+       xorq    %r13,%r14
+       leaq    (%r8,%r15,1),%r8
+       movq    %rbx,%r12
+       addq    64(%rsp),%rdx
+       andq    %rax,%r12
+       rorxq   $41,%rax,%r13
+       rorxq   $18,%rax,%r15
+       leaq    (%r8,%r14,1),%r8
+       leaq    (%rdx,%r12,1),%rdx
+       andnq   %rcx,%rax,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rax,%r14
+       leaq    (%rdx,%r12,1),%rdx
+       xorq    %r14,%r13
+       movq    %r8,%r15
+       rorxq   $39,%r8,%r12
+       leaq    (%rdx,%r13,1),%rdx
+       xorq    %r9,%r15
+       rorxq   $34,%r8,%r14
+       rorxq   $28,%r8,%r13
+       leaq    (%r11,%rdx,1),%r11
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r9,%rdi
+       xorq    %r13,%r14
+       leaq    (%rdx,%rdi,1),%rdx
+       movq    %rax,%r12
+       addq    72(%rsp),%rcx
+       andq    %r11,%r12
+       rorxq   $41,%r11,%r13
+       rorxq   $18,%r11,%rdi
+       leaq    (%rdx,%r14,1),%rdx
+       leaq    (%rcx,%r12,1),%rcx
+       andnq   %rbx,%r11,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r11,%r14
+       leaq    (%rcx,%r12,1),%rcx
+       xorq    %r14,%r13
+       movq    %rdx,%rdi
+       rorxq   $39,%rdx,%r12
+       leaq    (%rcx,%r13,1),%rcx
+       xorq    %r8,%rdi
+       rorxq   $34,%rdx,%r14
+       rorxq   $28,%rdx,%r13
+       leaq    (%r10,%rcx,1),%r10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r8,%r15
+       xorq    %r13,%r14
+       leaq    (%rcx,%r15,1),%rcx
+       movq    %r11,%r12
+       addq    96(%rsp),%rbx
+       andq    %r10,%r12
+       rorxq   $41,%r10,%r13
+       rorxq   $18,%r10,%r15
+       leaq    (%rcx,%r14,1),%rcx
+       leaq    (%rbx,%r12,1),%rbx
+       andnq   %rax,%r10,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r10,%r14
+       leaq    (%rbx,%r12,1),%rbx
+       xorq    %r14,%r13
+       movq    %rcx,%r15
+       rorxq   $39,%rcx,%r12
+       leaq    (%rbx,%r13,1),%rbx
+       xorq    %rdx,%r15
+       rorxq   $34,%rcx,%r14
+       rorxq   $28,%rcx,%r13
+       leaq    (%r9,%rbx,1),%r9
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rdx,%rdi
+       xorq    %r13,%r14
+       leaq    (%rbx,%rdi,1),%rbx
+       movq    %r10,%r12
+       addq    104(%rsp),%rax
+       andq    %r9,%r12
+       rorxq   $41,%r9,%r13
+       rorxq   $18,%r9,%rdi
+       leaq    (%rbx,%r14,1),%rbx
+       leaq    (%rax,%r12,1),%rax
+       andnq   %r11,%r9,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r9,%r14
+       leaq    (%rax,%r12,1),%rax
+       xorq    %r14,%r13
+       movq    %rbx,%rdi
+       rorxq   $39,%rbx,%r12
+       leaq    (%rax,%r13,1),%rax
+       xorq    %rcx,%rdi
+       rorxq   $34,%rbx,%r14
+       rorxq   $28,%rbx,%r13
+       leaq    (%r8,%rax,1),%r8
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rcx,%r15
+       xorq    %r13,%r14
+       leaq    (%rax,%r15,1),%rax
+       movq    %r9,%r12
+       movq    1280(%rsp),%rdi
+       addq    %r14,%rax
+
+       leaq    1152(%rsp),%rbp
+
+       addq    0(%rdi),%rax
+       addq    8(%rdi),%rbx
+       addq    16(%rdi),%rcx
+       addq    24(%rdi),%rdx
+       addq    32(%rdi),%r8
+       addq    40(%rdi),%r9
+       addq    48(%rdi),%r10
+       addq    56(%rdi),%r11
+
+       movq    %rax,0(%rdi)
+       movq    %rbx,8(%rdi)
+       movq    %rcx,16(%rdi)
+       movq    %rdx,24(%rdi)
+       movq    %r8,32(%rdi)
+       movq    %r9,40(%rdi)
+       movq    %r10,48(%rdi)
+       movq    %r11,56(%rdi)
+
+       cmpq    144(%rbp),%rsi
+       je      .Ldone_avx2
+
+       xorq    %r14,%r14
+       movq    %rbx,%rdi
+       xorq    %rcx,%rdi
+       movq    %r9,%r12
+       jmp     .Lower_avx2
+.align 16
+.Lower_avx2:
+       addq    0+16(%rbp),%r11
+       andq    %r8,%r12
+       rorxq   $41,%r8,%r13
+       rorxq   $18,%r8,%r15
+       leaq    (%rax,%r14,1),%rax
+       leaq    (%r11,%r12,1),%r11
+       andnq   %r10,%r8,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r8,%r14
+       leaq    (%r11,%r12,1),%r11
+       xorq    %r14,%r13
+       movq    %rax,%r15
+       rorxq   $39,%rax,%r12
+       leaq    (%r11,%r13,1),%r11
+       xorq    %rbx,%r15
+       rorxq   $34,%rax,%r14
+       rorxq   $28,%rax,%r13
+       leaq    (%rdx,%r11,1),%rdx
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rbx,%rdi
+       xorq    %r13,%r14
+       leaq    (%r11,%rdi,1),%r11
+       movq    %r8,%r12
+       addq    8+16(%rbp),%r10
+       andq    %rdx,%r12
+       rorxq   $41,%rdx,%r13
+       rorxq   $18,%rdx,%rdi
+       leaq    (%r11,%r14,1),%r11
+       leaq    (%r10,%r12,1),%r10
+       andnq   %r9,%rdx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rdx,%r14
+       leaq    (%r10,%r12,1),%r10
+       xorq    %r14,%r13
+       movq    %r11,%rdi
+       rorxq   $39,%r11,%r12
+       leaq    (%r10,%r13,1),%r10
+       xorq    %rax,%rdi
+       rorxq   $34,%r11,%r14
+       rorxq   $28,%r11,%r13
+       leaq    (%rcx,%r10,1),%rcx
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rax,%r15
+       xorq    %r13,%r14
+       leaq    (%r10,%r15,1),%r10
+       movq    %rdx,%r12
+       addq    32+16(%rbp),%r9
+       andq    %rcx,%r12
+       rorxq   $41,%rcx,%r13
+       rorxq   $18,%rcx,%r15
+       leaq    (%r10,%r14,1),%r10
+       leaq    (%r9,%r12,1),%r9
+       andnq   %r8,%rcx,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rcx,%r14
+       leaq    (%r9,%r12,1),%r9
+       xorq    %r14,%r13
+       movq    %r10,%r15
+       rorxq   $39,%r10,%r12
+       leaq    (%r9,%r13,1),%r9
+       xorq    %r11,%r15
+       rorxq   $34,%r10,%r14
+       rorxq   $28,%r10,%r13
+       leaq    (%rbx,%r9,1),%rbx
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r11,%rdi
+       xorq    %r13,%r14
+       leaq    (%r9,%rdi,1),%r9
+       movq    %rcx,%r12
+       addq    40+16(%rbp),%r8
+       andq    %rbx,%r12
+       rorxq   $41,%rbx,%r13
+       rorxq   $18,%rbx,%rdi
+       leaq    (%r9,%r14,1),%r9
+       leaq    (%r8,%r12,1),%r8
+       andnq   %rdx,%rbx,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%rbx,%r14
+       leaq    (%r8,%r12,1),%r8
+       xorq    %r14,%r13
+       movq    %r9,%rdi
+       rorxq   $39,%r9,%r12
+       leaq    (%r8,%r13,1),%r8
+       xorq    %r10,%rdi
+       rorxq   $34,%r9,%r14
+       rorxq   $28,%r9,%r13
+       leaq    (%rax,%r8,1),%rax
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r10,%r15
+       xorq    %r13,%r14
+       leaq    (%r8,%r15,1),%r8
+       movq    %rbx,%r12
+       addq    64+16(%rbp),%rdx
+       andq    %rax,%r12
+       rorxq   $41,%rax,%r13
+       rorxq   $18,%rax,%r15
+       leaq    (%r8,%r14,1),%r8
+       leaq    (%rdx,%r12,1),%rdx
+       andnq   %rcx,%rax,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%rax,%r14
+       leaq    (%rdx,%r12,1),%rdx
+       xorq    %r14,%r13
+       movq    %r8,%r15
+       rorxq   $39,%r8,%r12
+       leaq    (%rdx,%r13,1),%rdx
+       xorq    %r9,%r15
+       rorxq   $34,%r8,%r14
+       rorxq   $28,%r8,%r13
+       leaq    (%r11,%rdx,1),%r11
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %r9,%rdi
+       xorq    %r13,%r14
+       leaq    (%rdx,%rdi,1),%rdx
+       movq    %rax,%r12
+       addq    72+16(%rbp),%rcx
+       andq    %r11,%r12
+       rorxq   $41,%r11,%r13
+       rorxq   $18,%r11,%rdi
+       leaq    (%rdx,%r14,1),%rdx
+       leaq    (%rcx,%r12,1),%rcx
+       andnq   %rbx,%r11,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r11,%r14
+       leaq    (%rcx,%r12,1),%rcx
+       xorq    %r14,%r13
+       movq    %rdx,%rdi
+       rorxq   $39,%rdx,%r12
+       leaq    (%rcx,%r13,1),%rcx
+       xorq    %r8,%rdi
+       rorxq   $34,%rdx,%r14
+       rorxq   $28,%rdx,%r13
+       leaq    (%r10,%rcx,1),%r10
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %r8,%r15
+       xorq    %r13,%r14
+       leaq    (%rcx,%r15,1),%rcx
+       movq    %r11,%r12
+       addq    96+16(%rbp),%rbx
+       andq    %r10,%r12
+       rorxq   $41,%r10,%r13
+       rorxq   $18,%r10,%r15
+       leaq    (%rcx,%r14,1),%rcx
+       leaq    (%rbx,%r12,1),%rbx
+       andnq   %rax,%r10,%r12
+       xorq    %r15,%r13
+       rorxq   $14,%r10,%r14
+       leaq    (%rbx,%r12,1),%rbx
+       xorq    %r14,%r13
+       movq    %rcx,%r15
+       rorxq   $39,%rcx,%r12
+       leaq    (%rbx,%r13,1),%rbx
+       xorq    %rdx,%r15
+       rorxq   $34,%rcx,%r14
+       rorxq   $28,%rcx,%r13
+       leaq    (%r9,%rbx,1),%r9
+       andq    %r15,%rdi
+       xorq    %r12,%r14
+       xorq    %rdx,%rdi
+       xorq    %r13,%r14
+       leaq    (%rbx,%rdi,1),%rbx
+       movq    %r10,%r12
+       addq    104+16(%rbp),%rax
+       andq    %r9,%r12
+       rorxq   $41,%r9,%r13
+       rorxq   $18,%r9,%rdi
+       leaq    (%rbx,%r14,1),%rbx
+       leaq    (%rax,%r12,1),%rax
+       andnq   %r11,%r9,%r12
+       xorq    %rdi,%r13
+       rorxq   $14,%r9,%r14
+       leaq    (%rax,%r12,1),%rax
+       xorq    %r14,%r13
+       movq    %rbx,%rdi
+       rorxq   $39,%rbx,%r12
+       leaq    (%rax,%r13,1),%rax
+       xorq    %rcx,%rdi
+       rorxq   $34,%rbx,%r14
+       rorxq   $28,%rbx,%r13
+       leaq    (%r8,%rax,1),%r8
+       andq    %rdi,%r15
+       xorq    %r12,%r14
+       xorq    %rcx,%r15
+       xorq    %r13,%r14
+       leaq    (%rax,%r15,1),%rax
+       movq    %r9,%r12
+       leaq    -128(%rbp),%rbp
+       cmpq    %rsp,%rbp
+       jae     .Lower_avx2
+
+       movq    1280(%rsp),%rdi
+       addq    %r14,%rax
+
+       leaq    1152(%rsp),%rsp
+
+.cfi_escape    0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08
+
+       addq    0(%rdi),%rax
+       addq    8(%rdi),%rbx
+       addq    16(%rdi),%rcx
+       addq    24(%rdi),%rdx
+       addq    32(%rdi),%r8
+       addq    40(%rdi),%r9
+       leaq    256(%rsi),%rsi
+       addq    48(%rdi),%r10
+       movq    %rsi,%r12
+       addq    56(%rdi),%r11
+       cmpq    128+16(%rsp),%rsi
+
+       movq    %rax,0(%rdi)
+       cmoveq  %rsp,%r12
+       movq    %rbx,8(%rdi)
+       movq    %rcx,16(%rdi)
+       movq    %rdx,24(%rdi)
+       movq    %r8,32(%rdi)
+       movq    %r9,40(%rdi)
+       movq    %r10,48(%rdi)
+       movq    %r11,56(%rdi)
+
+       jbe     .Loop_avx2
+       leaq    (%rsp),%rbp
+
+.cfi_escape    0x0f,0x06,0x76,0x98,0x01,0x06,0x23,0x08
+
+.Ldone_avx2:
+       movq    152(%rbp),%rsi
+.cfi_def_cfa   %rsi,8
+       vzeroupper
+       movq    -48(%rsi),%r15
+.cfi_restore   %r15
+       movq    -40(%rsi),%r14
+.cfi_restore   %r14
+       movq    -32(%rsi),%r13
+.cfi_restore   %r13
+       movq    -24(%rsi),%r12
+.cfi_restore   %r12
+       movq    -16(%rsi),%rbp
+.cfi_restore   %rbp
+       movq    -8(%rsi),%rbx
+.cfi_restore   %rbx
+       leaq    (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
+.Lepilogue_avx2:
+       RET
+.cfi_endproc
+SET_SIZE(zfs_sha512_transform_avx2)
+
+#if defined(__ELF__)
+       .section .note.GNU-stack,"",%progbits
+#endif
+#endif
diff --git a/module/icp/include/generic_impl.c b/module/icp/include/generic_impl.c
new file mode 100644 (file)
index 0000000..16f802c
--- /dev/null
@@ -0,0 +1,233 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2003, 2010 Oracle and/or its affiliates.
+ * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ */
+
+/*
+ * This file gets included by C files for implementing the full set
+ * of zfs_impl.h defines.
+ *
+ * It is meant to make maintaining multiple implementations of an
+ * algorithm easier. See blake3_impl.c, sha256_impl.c, or sha512_impl.c
+ * for reference.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_impl.h>
+
+/* Two default implementations */
+#define        IMPL_FASTEST    (UINT32_MAX)
+#define        IMPL_CYCLE      (UINT32_MAX - 1)
+
+#define        IMPL_READ(i)    (*(volatile uint32_t *) &(i))
+
+/* Implementation that contains the fastest method */
+static IMPL_OPS_T generic_fastest_impl = {
+       .name = "fastest"
+};
+
+/* Hold all supported implementations */
+static const IMPL_OPS_T *generic_supp_impls[ARRAY_SIZE(IMPL_ARRAY)];
+static uint32_t generic_supp_impls_cnt = 0;
+
+/* Currently selected implementation */
+static uint32_t generic_impl_chosen = IMPL_FASTEST;
+
+static struct generic_impl_selector {
+       const char *name;
+       uint32_t sel;
+} generic_impl_selectors[] = {
+       { "cycle",      IMPL_CYCLE },
+       { "fastest",    IMPL_FASTEST }
+};
+
+/* check the supported implementations */
+static void
+generic_impl_init(void)
+{
+       int i, c;
+
+       /* init only once */
+       if (likely(generic_supp_impls_cnt != 0))
+               return;
+
+       /* Move supported implementations into generic_supp_impls */
+       for (i = 0, c = 0; i < ARRAY_SIZE(IMPL_ARRAY); i++) {
+               const IMPL_OPS_T *impl = IMPL_ARRAY[i];
+
+               if (impl->is_supported && impl->is_supported())
+                       generic_supp_impls[c++] = impl;
+       }
+       generic_supp_impls_cnt = c;
+
+       /* first init generic impl, may be changed via set_fastest() */
+       memcpy(&generic_fastest_impl, generic_supp_impls[0],
+           sizeof (generic_fastest_impl));
+}
+
+/* get number of supported implementations */
+static uint32_t
+generic_impl_getcnt(void)
+{
+       generic_impl_init();
+       return (generic_supp_impls_cnt);
+}
+
+/* get id of selected implementation */
+static uint32_t
+generic_impl_getid(void)
+{
+       generic_impl_init();
+       return (IMPL_READ(generic_impl_chosen));
+}
+
+/* get name of selected implementation */
+static const char *
+generic_impl_getname(void)
+{
+       uint32_t impl = IMPL_READ(generic_impl_chosen);
+
+       generic_impl_init();
+       switch (impl) {
+       case IMPL_FASTEST:
+               return ("fastest");
+       case IMPL_CYCLE:
+               return ("cycle");
+       default:
+               return (generic_supp_impls[impl]->name);
+       }
+}
+
+/* set implementation by id */
+static void
+generic_impl_setid(uint32_t id)
+{
+       generic_impl_init();
+       switch (id) {
+       case IMPL_FASTEST:
+               atomic_swap_32(&generic_impl_chosen, IMPL_FASTEST);
+               break;
+       case IMPL_CYCLE:
+               atomic_swap_32(&generic_impl_chosen, IMPL_CYCLE);
+               break;
+       default:
+               ASSERT3U(id, <, generic_supp_impls_cnt);
+               atomic_swap_32(&generic_impl_chosen, id);
+               break;
+       }
+}
+
+/* set implementation by name */
+static int
+generic_impl_setname(const char *val)
+{
+       uint32_t impl = IMPL_READ(generic_impl_chosen);
+       size_t val_len;
+       int i, err = -EINVAL;
+
+       generic_impl_init();
+       val_len = strlen(val);
+       while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */
+               val_len--;
+
+       /* check mandatory implementations */
+       for (i = 0; i < ARRAY_SIZE(generic_impl_selectors); i++) {
+               const char *name = generic_impl_selectors[i].name;
+
+               if (val_len == strlen(name) &&
+                   strncmp(val, name, val_len) == 0) {
+                       impl = generic_impl_selectors[i].sel;
+                       err = 0;
+                       break;
+               }
+       }
+
+       /* check all supported implementations */
+       if (err != 0) {
+               for (i = 0; i < generic_supp_impls_cnt; i++) {
+                       const char *name = generic_supp_impls[i]->name;
+
+                       if (val_len == strlen(name) &&
+                           strncmp(val, name, val_len) == 0) {
+                               impl = i;
+                               err = 0;
+                               break;
+                       }
+               }
+       }
+
+       if (err == 0) {
+               atomic_swap_32(&generic_impl_chosen, impl);
+       }
+
+       return (err);
+}
+
+/* setup id as fastest implementation */
+static void
+generic_impl_set_fastest(uint32_t id)
+{
+       generic_impl_init();
+       memcpy(&generic_fastest_impl, generic_supp_impls[id],
+           sizeof (generic_fastest_impl));
+}
+
+/* return impl iterating functions */
+const zfs_impl_t ZFS_IMPL_OPS = {
+       .name = IMPL_NAME,
+       .getcnt = generic_impl_getcnt,
+       .getid = generic_impl_getid,
+       .getname = generic_impl_getname,
+       .set_fastest = generic_impl_set_fastest,
+       .setid = generic_impl_setid,
+       .setname = generic_impl_setname
+};
+
+/* get impl ops_t of selected implementation */
+const IMPL_OPS_T *
+IMPL_GET_OPS(void)
+{
+       const IMPL_OPS_T *ops = NULL;
+       uint32_t idx, impl = IMPL_READ(generic_impl_chosen);
+       static uint32_t cycle_count = 0;
+
+       generic_impl_init();
+       switch (impl) {
+       case IMPL_FASTEST:
+               ops = &generic_fastest_impl;
+               break;
+       case IMPL_CYCLE:
+               idx = (++cycle_count) % generic_supp_impls_cnt;
+               ops = generic_supp_impls[idx];
+               break;
+       default:
+               ASSERT3U(impl, <, generic_supp_impls_cnt);
+               ops = generic_supp_impls[impl];
+               break;
+       }
+
+       ASSERT3P(ops, !=, NULL);
+       return (ops);
+}
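
The skeleton above is driven entirely by macros that the including C file must define before the #include. As a rough illustration of the pattern (the macro names come from generic_impl.c itself; the helper names, the transform symbol, and the values given to the defines below are hypothetical placeholders, not necessarily the names used by the commit's actual sha256_impl.c), a consumer could look something like this:

	/* sketch of a consuming file, assuming sha2_impl.h from this commit */
	#include <sys/zfs_context.h>
	#include <sys/sha2.h>
	#include <sha2/sha2_impl.h>

	static boolean_t
	sha256_have_generic(void)		/* hypothetical helper */
	{
		return (B_TRUE);		/* pure C code runs everywhere */
	}

	/* hypothetical pure-C transform with the sha256_f signature */
	extern void sha256_block_generic(uint32_t state[8], const void *data,
	    size_t blks);

	static const sha256_ops_t sha256_generic_impl = {
		.name = "generic",
		.transform = sha256_block_generic,
		.is_supported = sha256_have_generic,
	};

	/* table probed by generic_impl_init() via is_supported() */
	static const sha256_ops_t *const sha256_impls[] = {
		&sha256_generic_impl,
	};

	/* names the skeleton expects to be defined before inclusion */
	#define	IMPL_NAME	"sha256"
	#define	IMPL_OPS_T	sha256_ops_t
	#define	IMPL_ARRAY	sha256_impls
	#define	IMPL_GET_OPS	sha256_get_ops
	#define	ZFS_IMPL_OPS	zfs_sha256_ops

	#include <generic_impl.c>

With those defines in place the skeleton provides the IMPL_GET_OPS accessor and a zfs_impl_t table whose setname()/setid() hooks accept "fastest", "cycle", or any name from the implementation table.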
index 0e89747eefd168cc3edef804198970869d2bfb47..9a1bd38f1a77ed622f07ced361d2088735e75889 100644 (file)
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
  */
 
 #ifndef        _SHA2_IMPL_H
 extern "C" {
 #endif
 
+/* transform function definition */
+typedef void (*sha256_f)(uint32_t state[8], const void *data, size_t blks);
+typedef void (*sha512_f)(uint64_t state[8], const void *data, size_t blks);
+
+/* needed for checking valid implementations */
+typedef boolean_t (*sha2_is_supported_f)(void);
+
+typedef struct {
+       const char *name;
+       sha256_f transform;
+       sha2_is_supported_f is_supported;
+} sha256_ops_t;
+
+typedef struct {
+       const char *name;
+       sha512_f transform;
+       sha2_is_supported_f is_supported;
+} sha512_ops_t;
+
+extern const sha256_ops_t *sha256_get_ops(void);
+extern const sha512_ops_t *sha512_get_ops(void);
+
 typedef enum {
        SHA1_TYPE,
        SHA256_TYPE,
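
The new accessors declared above return whichever implementation is currently selected. A minimal, hypothetical caller (the function name and driving code are assumptions; only sha256_get_ops() and the sha256_f signature come from this header) would treat the transform as a raw compression function, i.e. the caller remains responsible for SHA-256 padding and finalization:

	/* hash one already-padded 64-byte block with the selected implementation */
	static void
	sha256_one_block_example(const uint8_t block[64])
	{
		/* standard SHA-256 initial hash values (FIPS 180-4) */
		uint32_t state[8] = {
			0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
			0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
		};
		const sha256_ops_t *ops = sha256_get_ops();

		ops->transform(state, block, 1);	/* blks is a block count */
	}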
index a58f0982c8c0301ea5f9d5a7e6db093ec2ed001e..f068951b07f561c5c896846daa5e01a99c8c0622 100644 (file)
@@ -28,7 +28,6 @@
 #include <sys/crypto/common.h>
 #include <sys/crypto/spi.h>
 #include <sys/crypto/icp.h>
-#define        _SHA2_IMPL
 #include <sys/sha2.h>
 #include <sha2/sha2_impl.h>
 
diff --git a/module/zfs/sha256.c b/module/zfs/sha256.c
deleted file mode 100644 (file)
index 445d82e..0000000
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or https://opensource.org/licenses/CDDL-1.0.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright 2013 Saso Kiselkov. All rights reserved.
- * Copyright (c) 2016 by Delphix. All rights reserved.
- */
-#include <sys/zfs_context.h>
-#include <sys/zio.h>
-#include <sys/zio_checksum.h>
-#include <sys/sha2.h>
-#include <sys/abd.h>
-#include <sys/qat.h>
-
-static int
-sha_incremental(void *buf, size_t size, void *arg)
-{
-       SHA2_CTX *ctx = arg;
-       SHA2Update(ctx, buf, size);
-       return (0);
-}
-
-void
-abd_checksum_SHA256(abd_t *abd, uint64_t size,
-    const void *ctx_template, zio_cksum_t *zcp)
-{
-       (void) ctx_template;
-       int ret;
-       SHA2_CTX ctx;
-       zio_cksum_t tmp;
-
-       if (qat_checksum_use_accel(size)) {
-               uint8_t *buf = abd_borrow_buf_copy(abd, size);
-               ret = qat_checksum(ZIO_CHECKSUM_SHA256, buf, size, &tmp);
-               abd_return_buf(abd, buf, size);
-               if (ret == CPA_STATUS_SUCCESS)
-                       goto bswap;
-
-               /* If the hardware implementation fails fall back to software */
-       }
-
-       SHA2Init(SHA256, &ctx);
-       (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx);
-       SHA2Final(&tmp, &ctx);
-
-bswap:
-       /*
-        * A prior implementation of this function had a
-        * private SHA256 implementation always wrote things out in
-        * Big Endian and there wasn't a byteswap variant of it.
-        * To preserve on disk compatibility we need to force that
-        * behavior.
-        */
-       zcp->zc_word[0] = BE_64(tmp.zc_word[0]);
-       zcp->zc_word[1] = BE_64(tmp.zc_word[1]);
-       zcp->zc_word[2] = BE_64(tmp.zc_word[2]);
-       zcp->zc_word[3] = BE_64(tmp.zc_word[3]);
-}
-
-void
-abd_checksum_SHA512_native(abd_t *abd, uint64_t size,
-    const void *ctx_template, zio_cksum_t *zcp)
-{
-       (void) ctx_template;
-       SHA2_CTX        ctx;
-
-       SHA2Init(SHA512_256, &ctx);
-       (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx);
-       SHA2Final(zcp, &ctx);
-}
-
-void
-abd_checksum_SHA512_byteswap(abd_t *abd, uint64_t size,
-    const void *ctx_template, zio_cksum_t *zcp)
-{
-       zio_cksum_t     tmp;
-
-       abd_checksum_SHA512_native(abd, size, ctx_template, &tmp);
-       zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
-       zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
-       zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
-       zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]);
-}
diff --git a/module/zfs/sha2_zfs.c b/module/zfs/sha2_zfs.c
new file mode 100644 (file)
index 0000000..872b1e5
--- /dev/null
@@ -0,0 +1,102 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zio_checksum.h>
+#include <sys/sha2.h>
+#include <sys/abd.h>
+#include <sys/qat.h>
+
+static int
+sha_incremental(void *buf, size_t size, void *arg)
+{
+       SHA2_CTX *ctx = arg;
+       SHA2Update(ctx, buf, size);
+       return (0);
+}
+
+void
+abd_checksum_sha256(abd_t *abd, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+       (void) ctx_template;
+       int ret;
+       SHA2_CTX ctx;
+       zio_cksum_t tmp;
+
+       if (qat_checksum_use_accel(size)) {
+               uint8_t *buf = abd_borrow_buf_copy(abd, size);
+               ret = qat_checksum(ZIO_CHECKSUM_SHA256, buf, size, &tmp);
+               abd_return_buf(abd, buf, size);
+               if (ret == CPA_STATUS_SUCCESS)
+                       goto bswap;
+
+               /* If the hardware implementation fails fall back to software */
+       }
+
+       SHA2Init(SHA256, &ctx);
+       (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx);
+       SHA2Final(&tmp, &ctx);
+
+bswap:
+       /*
+        * A prior version of this function used a private SHA256
+        * implementation that always wrote the digest out in big
+        * endian, and there was no byteswap variant of it.  To
+        * preserve on-disk compatibility we need to force that
+        * behavior.
+        */
+       zcp->zc_word[0] = BE_64(tmp.zc_word[0]);
+       zcp->zc_word[1] = BE_64(tmp.zc_word[1]);
+       zcp->zc_word[2] = BE_64(tmp.zc_word[2]);
+       zcp->zc_word[3] = BE_64(tmp.zc_word[3]);
+}
+
+void
+abd_checksum_sha512_native(abd_t *abd, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+       (void) ctx_template;
+       SHA2_CTX        ctx;
+
+       SHA2Init(SHA512_256, &ctx);
+       (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx);
+       SHA2Final(zcp, &ctx);
+}
+
+void
+abd_checksum_sha512_byteswap(abd_t *abd, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+       zio_cksum_t     tmp;
+
+       abd_checksum_sha512_native(abd, size, ctx_template, &tmp);
+       zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
+       zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
+       zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
+       zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]);
+}
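
The native/byteswap pairing above follows a simple rule: the byteswap flavor recomputes nothing and only swaps each 64-bit word of the natively computed digest. A minimal sketch, using __builtin_bswap64() as a stand-in for the kernel's BSWAP_64 macro:

/*
 * Sketch of the native/byteswap pairing: swap each 64-bit word of the
 * natively computed digest.  __builtin_bswap64() stands in for the
 * BSWAP_64 macro used in the kernel sources.
 */
#include <stdint.h>
#include <stdio.h>

typedef struct {
        uint64_t zc_word[4];
} demo_cksum_t;

static void
demo_cksum_byteswap(const demo_cksum_t *native, demo_cksum_t *out)
{
        for (int i = 0; i < 4; i++)
                out->zc_word[i] = __builtin_bswap64(native->zc_word[i]);
}

int
main(void)
{
        demo_cksum_t n = { { 0x0102030405060708ULL, 0, 0, 0 } };
        demo_cksum_t b;

        demo_cksum_byteswap(&n, &b);
        (void) printf("%016llx -> %016llx\n",
            (unsigned long long)n.zc_word[0],
            (unsigned long long)b.zc_word[0]);
        return (0);
}
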
index 91247f29278f01df8e128dcfdb09b265361881e6..acedeab7a163699ef2963c94f06efde0be8fda7c 100644 (file)
  * Copyright (c) 2021-2022 Tino Reichardt <milky-zfs@mcmilk.de>
  */
 
-#include <sys/types.h>
-#include <sys/spa.h>
 #include <sys/zio_checksum.h>
 #include <sys/zfs_context.h>
 #include <sys/zfs_chksum.h>
+#include <sys/zfs_impl.h>
 
 #include <sys/blake3.h>
+#include <sys/sha2.h>
 
 /* limit benchmarking to 256 KiB max when EdonR is slower than this: */
 #define        LIMIT_PERF_MBS  300
@@ -56,25 +56,26 @@ static int chksum_stat_cnt = 0;
 static kstat_t *chksum_kstat = NULL;
 
 /*
- * i3-1005G1 test output:
+ * Sample output on an i3-1005G1 system:
  *
- * implementation     1k      4k     16k     64k    256k      1m      4m
- * fletcher-4       5421   15001   26468   32555   34720   32801   18847
- * edonr-generic    1196    1602    1761    1749    1762    1759    1751
- * skein-generic     546     591     608     615     619     612     616
- * sha256-generic    246     270     274     274     277     275     276
- * sha256-avx        262     296     304     307     307     307     306
- * sha256-sha-ni     769    1072    1172    1220    1219    1232    1228
- * sha256-openssl    240     300     316     314     304     285     276
- * sha512-generic    333     374     385     392     391     393     392
- * sha512-openssl    353     441     467     476     472     467     426
- * sha512-avx        362     444     473     475     479     476     478
- * sha512-avx2       394     500     530     538     543     545     542
- * blake3-generic    308     313     313     313     312     313     312
- * blake3-sse2       402    1289    1423    1446    1432    1458    1413
- * blake3-sse41      427    1470    1625    1704    1679    1607    1629
- * blake3-avx2       428    1920    3095    3343    3356    3318    3204
- * blake3-avx512     473    2687    4905    5836    5844    5643    5374
+ * implementation   1k      4k     16k     64k    256k      1m      4m     16m
+ * edonr-generic  1278    1625    1769    1776    1783    1778    1771    1767
+ * skein-generic   548     594     613     623     621     623     621     486
+ * sha256-generic  255     270     281     278     279     281     283     283
+ * sha256-x64      288     310     316     317     318     317     317     316
+ * sha256-ssse3    304     342     351     355     356     357     356     356
+ * sha256-avx      311     348     359     362     362     363     363     362
+ * sha256-avx2     330     378     389     395     395     395     395     395
+ * sha256-shani    908    1127    1212    1230    1233    1234    1223    1230
+ * sha512-generic  359     409     431     427     429     430     428     423
+ * sha512-x64      420     473     490     496     497     497     496     495
+ * sha512-avx      406     522     546     560     560     560     556     560
+ * sha512-avx2     464     568     601     606     609     610     607     608
+ * blake3-generic  330     327     324     323     324     320     323     322
+ * blake3-sse2     424    1366    1449    1468    1458    1453    1395    1408
+ * blake3-sse41    453    1554    1658    1703    1689    1669    1622    1630
+ * blake3-avx2     452    2013    3225    3351    3356    3261    3076    3101
+ * blake3-avx512   498    2869    5269    5926    5872    5643    5014    5005
  */
 static int
 chksum_kstat_headers(char *buf, size_t size)
@@ -237,25 +238,30 @@ abort:
 static void
 chksum_benchmark(void)
 {
-
 #ifndef _KERNEL
        /* we need the benchmark only for the kernel module */
        return;
 #endif
 
        chksum_stat_t *cs;
-       int cbid = 0;
-       uint64_t max = 0;
-       uint32_t id, id_save;
-
-       /* space for the benchmark times */
-       chksum_stat_cnt = 4;
-       chksum_stat_cnt += blake3_impl_getcnt();
+       uint64_t max;
+       uint32_t id, cbid = 0, id_save;
+       const zfs_impl_t *blake3 = zfs_impl_get_ops("blake3");
+       const zfs_impl_t *sha256 = zfs_impl_get_ops("sha256");
+       const zfs_impl_t *sha512 = zfs_impl_get_ops("sha512");
+
+       /* count implementations */
+       chksum_stat_cnt = 2;
+       chksum_stat_cnt += sha256->getcnt();
+       chksum_stat_cnt += sha512->getcnt();
+       chksum_stat_cnt += blake3->getcnt();
        chksum_stat_data = kmem_zalloc(
            sizeof (chksum_stat_t) * chksum_stat_cnt, KM_SLEEP);
 
        /* edonr - needs to be the first one here (slow CPU check) */
        cs = &chksum_stat_data[cbid++];
+
+       /* edonr */
        cs->init = abd_checksum_edonr_tmpl_init;
        cs->func = abd_checksum_edonr_native;
        cs->free = abd_checksum_edonr_tmpl_free;
@@ -273,42 +279,58 @@ chksum_benchmark(void)
        chksum_benchit(cs);
 
        /* sha256 */
-       cs = &chksum_stat_data[cbid++];
-       cs->init = 0;
-       cs->func = abd_checksum_SHA256;
-       cs->free = 0;
-       cs->name = "sha256";
-       cs->impl = "generic";
-       chksum_benchit(cs);
+       id_save = sha256->getid();
+       for (max = 0, id = 0; id < sha256->getcnt(); id++) {
+               sha256->setid(id);
+               cs = &chksum_stat_data[cbid++];
+               cs->init = 0;
+               cs->func = abd_checksum_sha256;
+               cs->free = 0;
+               cs->name = sha256->name;
+               cs->impl = sha256->getname();
+               chksum_benchit(cs);
+               if (cs->bs256k > max) {
+                       max = cs->bs256k;
+                       sha256->set_fastest(id);
+               }
+       }
+       sha256->setid(id_save);
 
        /* sha512 */
-       cs = &chksum_stat_data[cbid++];
-       cs->init = 0;
-       cs->func = abd_checksum_SHA512_native;
-       cs->free = 0;
-       cs->name = "sha512";
-       cs->impl = "generic";
-       chksum_benchit(cs);
+       id_save = sha512->getid();
+       for (max = 0, id = 0; id < sha512->getcnt(); id++) {
+               sha512->setid(id);
+               cs = &chksum_stat_data[cbid++];
+               cs->init = 0;
+               cs->func = abd_checksum_sha512_native;
+               cs->free = 0;
+               cs->name = sha512->name;
+               cs->impl = sha512->getname();
+               chksum_benchit(cs);
+               if (cs->bs256k > max) {
+                       max = cs->bs256k;
+                       sha512->set_fastest(id);
+               }
+       }
+       sha512->setid(id_save);
 
        /* blake3 */
-       id_save = blake3_impl_getid();
-       for (id = 0; id < blake3_impl_getcnt(); id++) {
-               blake3_impl_setid(id);
+       id_save = blake3->getid();
+       for (max = 0, id = 0; id < blake3->getcnt(); id++) {
+               blake3->setid(id);
                cs = &chksum_stat_data[cbid++];
                cs->init = abd_checksum_blake3_tmpl_init;
                cs->func = abd_checksum_blake3_native;
                cs->free = abd_checksum_blake3_tmpl_free;
-               cs->name = "blake3";
-               cs->impl = blake3_impl_getname();
+               cs->name = blake3->name;
+               cs->impl = blake3->getname();
                chksum_benchit(cs);
                if (cs->bs256k > max) {
                        max = cs->bs256k;
-                       blake3_impl_set_fastest(id);
+                       blake3->set_fastest(id);
                }
        }
-
-       /* restore initial value */
-       blake3_impl_setid(id_save);
+       blake3->setid(id_save);
 }
 
 void
diff --git a/module/zfs/zfs_impl.c b/module/zfs/zfs_impl.c
new file mode 100644 (file)
index 0000000..20322ff
--- /dev/null
@@ -0,0 +1,61 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or https://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ */
+
+#include <sys/zio_checksum.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_impl.h>
+
+#include <sys/blake3.h>
+#include <sys/sha2.h>
+
+/*
+ * impl_ops - backend for implementations of algorithms
+ */
+const zfs_impl_t *impl_ops[] = {
+       &zfs_blake3_ops,
+       &zfs_sha256_ops,
+       &zfs_sha512_ops,
+       NULL
+};
+
+/*
+ * zfs_impl_get_ops - Get the API functions for an impl backend
+ */
+const zfs_impl_t *
+zfs_impl_get_ops(const char *algo)
+{
+       const zfs_impl_t **ops = impl_ops;
+
+       if (!algo || !*algo)
+               return (*ops);
+
+       for (; *ops; ops++) {
+               if (strcmp(algo, (*ops)->name) == 0)
+                       break;
+       }
+
+       ASSERT3P(ops, !=, NULL);
+       return (*ops);
+}
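
A usage sketch for this registry, mirroring what chksum_benchmark() and sha2_test.c do: look up an algorithm by name, enumerate its compiled-in implementations, and restore the previous selection afterwards. Compiling it assumes a userspace build environment where <sys/zfs_impl.h> and the libraries providing these symbols (e.g. libzpool/libicp) are available; the loop body only uses the ops fields shown above.

/* list every compiled-in sha256 implementation by name, then restore */
#include <stdint.h>
#include <stdio.h>
#include <sys/zfs_impl.h>

int
main(void)
{
        const zfs_impl_t *sha256 = zfs_impl_get_ops("sha256");
        uint32_t id, saved = sha256->getid();

        for (id = 0; id < sha256->getcnt(); id++) {
                sha256->setid(id);
                (void) printf("%s-%s\n", sha256->name, sha256->getname());
        }
        sha256->setid(saved);   /* put the previous selection back */
        return (0);
}
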
index 3743eaa532ef20ce68a347c4f022fbfa4fb7aeda..6090959c5b8ced61f19b7e9b0ac734c585ae8e1d 100644 (file)
@@ -165,10 +165,10 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
        {{NULL, NULL}, NULL, NULL, 0, "on"},
        {{abd_checksum_off,             abd_checksum_off},
            NULL, NULL, 0, "off"},
-       {{abd_checksum_SHA256,          abd_checksum_SHA256},
+       {{abd_checksum_sha256,          abd_checksum_sha256},
            NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
            "label"},
-       {{abd_checksum_SHA256,          abd_checksum_SHA256},
+       {{abd_checksum_sha256,          abd_checksum_sha256},
            NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
            "gang_header"},
        {{abd_fletcher_2_native,        abd_fletcher_2_byteswap},
@@ -177,14 +177,14 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
            NULL, NULL, 0, "fletcher2"},
        {{abd_fletcher_4_native,        abd_fletcher_4_byteswap},
            NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"},
-       {{abd_checksum_SHA256,          abd_checksum_SHA256},
+       {{abd_checksum_sha256,          abd_checksum_sha256},
            NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
            ZCHECKSUM_FLAG_NOPWRITE, "sha256"},
        {{abd_fletcher_4_native,        abd_fletcher_4_byteswap},
            NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
        {{abd_checksum_off,             abd_checksum_off},
            NULL, NULL, 0, "noparity"},
-       {{abd_checksum_SHA512_native,   abd_checksum_SHA512_byteswap},
+       {{abd_checksum_sha512_native,   abd_checksum_sha512_byteswap},
            NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
            ZCHECKSUM_FLAG_NOPWRITE, "sha512"},
        {{abd_checksum_skein_native,    abd_checksum_skein_byteswap},
index d99e8757a24cdeffe39cf5a4bf98cedb948581cb..efcf812d77499f0ecd00534f36244c1da72c9d1e 100644 (file)
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
+
 #include <sys/time.h>
-#define        _SHA2_IMPL
 #include <sys/sha2.h>
 #include <sys/stdtypes.h>
-
+#include <sys/zfs_impl.h>
 
 /*
  * Test messages from:
@@ -174,9 +174,19 @@ main(int argc, char *argv[])
        boolean_t       failed = B_FALSE;
        uint64_t        cpu_mhz = 0;
 
+       const zfs_impl_t *sha256 = zfs_impl_get_ops("sha256");
+       const zfs_impl_t *sha512 = zfs_impl_get_ops("sha512");
+       uint32_t id;
+
        if (argc == 2)
                cpu_mhz = atoi(argv[1]);
 
+       if (!sha256)
+               return (1);
+
+       if (!sha512)
+               return (1);
+
 #define        SHA2_ALGO_TEST(_m, mode, diglen, testdigest)                    \
        do {                                                            \
                SHA2_CTX                ctx;                            \
@@ -194,7 +204,7 @@ main(int argc, char *argv[])
                }                                                       \
        } while (0)
 
-#define        SHA2_PERF_TEST(mode, diglen)                                    \
+#define        SHA2_PERF_TEST(mode, diglen, name)                              \
        do {                                                            \
                SHA2_CTX        ctx;                                    \
                uint8_t         digest[diglen / 8];                     \
@@ -216,8 +226,8 @@ main(int argc, char *argv[])
                        cpb = (cpu_mhz * 1e6 * ((double)delta /         \
                            1000000)) / (8192 * 128 * 1024);            \
                }                                                       \
-               (void) printf("SHA%-9s%llu us (%.02f CPB)\n", #mode,    \
-                   (u_longlong_t)delta, cpb);                          \
+               (void) printf("sha%s-%-9s%7llu us (%.02f CPB)\n", #mode,\
+                   name, (u_longlong_t)delta, cpb);                    \
        } while (0)
 
        (void) printf("Running algorithm correctness tests:\n");
@@ -237,8 +247,18 @@ main(int argc, char *argv[])
 
        (void) printf("Running performance tests (hashing 1024 MiB of "
            "data):\n");
-       SHA2_PERF_TEST(256, 256);
-       SHA2_PERF_TEST(512, 512);
+
+       for (id = 0; id < sha256->getcnt(); id++) {
+               sha256->setid(id);
+               const char *name = sha256->getname();
+               SHA2_PERF_TEST(256, 256, name);
+       }
+
+       for (id = 0; id < sha512->getcnt(); id++) {
+               sha512->setid(id);
+               const char *name = sha512->getname();
+               SHA2_PERF_TEST(512, 512, name);
+       }
 
        return (0);
 }
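
For reference, the cycles-per-byte figure printed by SHA2_PERF_TEST follows from hashing 8192 buffers of 128 KiB (1024 MiB total). A quick standalone check of that arithmetic, with made-up sample inputs rather than measured values:

/* sample inputs only; 8192 x 128 KiB = 1024 MiB hashed per run */
#include <stdio.h>

int
main(void)
{
        double cpu_mhz = 3400.0;        /* assumed clock, as on argv[1] */
        double delta_us = 900000.0;     /* assumed elapsed microseconds */
        double bytes = 8192.0 * 128 * 1024;
        double cpb = (cpu_mhz * 1e6 * (delta_us / 1e6)) / bytes;

        (void) printf("%.02f cycles per byte\n", cpb);
        return (0);
}
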