git.proxmox.com Git - mirror_zfs.git/commitdiff
Add AVX512BW variant of fletcher
author Romain Dolbeau <romain.dolbeau@atos.net>
Wed, 30 Oct 2019 19:26:14 +0000 (20:26 +0100)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Wed, 30 Oct 2019 19:26:14 +0000 (12:26 -0700)
It is much faster than AVX512F when byteswapping on Skylake-SP
and newer, as we can do the byteswap in a single vpshufb instead
of many instructions.

Reviewed-by: Gvozden Neskovic <neskovic@gmail.com>
Reviewed-by: Chunwei Chen <tuxoko@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Romain Dolbeau <romain.dolbeau@atos.net>
Closes #9517

include/zfs_fletcher.h
man/man5/zfs-module-parameters.5
module/zcommon/zfs_fletcher.c
module/zcommon/zfs_fletcher_avx512.c

index 5c7a61c56259073ba9d81fe3c2709abcf5b30d41..9e8b2cf7c7296e102ef4fb4d5cb50b396a5f36de 100644 (file)
@@ -143,6 +143,10 @@ extern const fletcher_4_ops_t fletcher_4_avx2_ops;
 extern const fletcher_4_ops_t fletcher_4_avx512f_ops;
 #endif
 
+#if defined(__x86_64) && defined(HAVE_AVX512BW)
+extern const fletcher_4_ops_t fletcher_4_avx512bw_ops;
+#endif
+
 #if defined(__aarch64__)
 extern const fletcher_4_ops_t fletcher_4_aarch64_neon_ops;
 #endif
index c711f6de61bfd07e4c2ec4f57bac136ac3072815..1c773435c9fb1c529407b63e7594d7ba805ac896 100644 (file)
@@ -1507,7 +1507,7 @@ Default value: \fB20\fR% of \fBzfs_dirty_data_max\fR.
 Select a fletcher 4 implementation.
 .sp
 Supported selectors are: \fBfastest\fR, \fBscalar\fR, \fBsse2\fR, \fBssse3\fR,
-\fBavx2\fR, \fBavx512f\fR, and \fBaarch64_neon\fR.
+\fBavx2\fR, \fBavx512f\fR, \fBavx512bw\fR, and \fBaarch64_neon\fR.
 All of the selectors except \fBfastest\fR and \fBscalar\fR require instruction
 set extensions to be available and will only appear if ZFS detects that they are
 present at runtime. If multiple implementations of fletcher 4 are available,
index 1280ace31899bf7acd03d95c337f3b7464ce83f2..f955dc8d9e371889cd38ce5004eae97b05c2b341 100644 (file)
@@ -184,6 +184,9 @@ static const fletcher_4_ops_t *fletcher_4_impls[] = {
 #if defined(__x86_64) && defined(HAVE_AVX512F)
        &fletcher_4_avx512f_ops,
 #endif
+#if defined(__x86_64) && defined(HAVE_AVX512BW)
+       &fletcher_4_avx512bw_ops,
+#endif
 #if defined(__aarch64__)
        &fletcher_4_aarch64_neon_ops,
 #endif
index 43806f264e5ecbabf69c8479b2b31101d6f9601e..d33d2dc33f36cf62b3fec08aa59fca5825a46de2 100644 (file)
@@ -171,4 +171,88 @@ const fletcher_4_ops_t fletcher_4_avx512f_ops = {
        .name = "avx512f"
 };
 
+#if defined(HAVE_AVX512BW)
+/*
+ * Runtime check for the avx512bw implementation.  The byteswap routine
+ * below issues a 512-bit vpshufb, which requires AVX512BW in addition to
+ * AVX512F; HAVE_AVX512BW only reflects toolchain support, so the CPU
+ * itself must be checked before this implementation is offered.
+ */
+static boolean_t
+fletcher_4_avx512bw_valid(void)
+{
+       return (fletcher_4_avx512f_valid() && zfs_avx512bw_available());
+}
+
+/*
+ * Fold "size" bytes starting at "buf" into the running sums in "ctx",
+ * byte-swapping each 32-bit input word first.
+ *
+ * Every iteration consumes eight 32-bit words (32 bytes), so "size" is
+ * presumably required to be a multiple of 32 -- TODO confirm with callers.
+ */
+static void
+fletcher_4_avx512bw_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
+    uint64_t size)
+{
+       /*
+        * vpshufb control, one pattern per 64-bit element.  Stored
+        * little-endian, the index bytes 3,2,1,0 (11,10,9,8 for the upper
+        * element of each 128-bit lane) reverse the low four bytes; the
+        * 0xFF bytes have the high bit set and so zero the upper four.
+        */
+       static const zfs_fletcher_avx512_t mask = {
+               .v = { 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
+               0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
+               0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
+               0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B }
+       };
+       const uint32_t *ip = buf;
+       const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
+
+       kfpu_begin();
+
+       /* NOTE(review): presumably loads the running sums into zmm0-zmm3. */
+       FLETCHER_4_AVX512_RESTORE_CTX(ctx);
+
+       /* The shuffle control is loop-invariant; keep it in zmm5. */
+       __asm("vmovdqu64 %0, %%zmm5" :: "m" (mask));
+
+       for (; ip < ipend; ip += 8) {
+               /* Zero-extend eight 32-bit words to eight 64-bit elements. */
+               __asm("vpmovzxdq %0, %%zmm4"::"m" (*ip));
+
+               /* Byteswap all eight words with a single shuffle. */
+               __asm("vpshufb %zmm5, %zmm4, %zmm4");
+
+               /* Cascade the new words through the four running sums. */
+               __asm("vpaddq %zmm4, %zmm0, %zmm0");
+               __asm("vpaddq %zmm0, %zmm1, %zmm1");
+               __asm("vpaddq %zmm1, %zmm2, %zmm2");
+               __asm("vpaddq %zmm2, %zmm3, %zmm3");
+       }
+
+       /* NOTE(review): presumably stores zmm0-zmm3 back into ctx. */
+       FLETCHER_4_AVX512_SAVE_CTX(ctx)
+
+       kfpu_end();
+}
+STACK_FRAME_NON_STANDARD(fletcher_4_avx512bw_byteswap);
+
+/*
+ * Only the byteswap path differs from avx512f; init, fini and the native
+ * compute routine are shared with that implementation.
+ */
+const fletcher_4_ops_t fletcher_4_avx512bw_ops = {
+       .init_native = fletcher_4_avx512f_init,
+       .fini_native = fletcher_4_avx512f_fini,
+       .compute_native = fletcher_4_avx512f_native,
+       .init_byteswap = fletcher_4_avx512f_init,
+       .fini_byteswap = fletcher_4_avx512f_fini,
+       .compute_byteswap = fletcher_4_avx512bw_byteswap,
+       .valid = fletcher_4_avx512bw_valid,
+       .name = "avx512bw"
+};
+#endif
+
 #endif /* defined(__x86_64) && defined(HAVE_AVX512F) */