]>
Commit | Line | Data |
---|---|---|
70b258fc GN |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * Copyright (C) 2016 Gvozden Nešković. All rights reserved. | |
23 | */ | |
24 | ||
25 | #if defined(__x86_64) && defined(HAVE_AVX512F) | |
26 | ||
27 | #include <linux/simd_x86.h> | |
28 | #include <sys/byteorder.h> | |
c28a6773 | 29 | #include <sys/frame.h> |
70b258fc | 30 | #include <sys/spa_checksum.h> |
93ce2b4c | 31 | #include <sys/strings.h> |
70b258fc GN |
32 | #include <zfs_fletcher.h> |
33 | ||
/*
 * Local shorthand: every __asm(...) statement in this file expands to
 * __asm__ __volatile__(...).
 * NOTE(review): presumably volatile is wanted so the compiler does not
 * discard the register-only statements that have no outputs -- confirm.
 */
#define	__asm	__asm__ __volatile__
35 | ||
5bf703b8 GN |
36 | static void |
37 | fletcher_4_avx512f_init(fletcher_4_ctx_t *ctx) | |
38 | { | |
39 | bzero(ctx->avx512, 4 * sizeof (zfs_fletcher_avx512_t)); | |
40 | } | |
70b258fc GN |
41 | |
42 | static void | |
5bf703b8 | 43 | fletcher_4_avx512f_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) |
70b258fc | 44 | { |
5bf703b8 GN |
45 | static const uint64_t |
46 | CcA[] = { 0, 0, 1, 3, 6, 10, 15, 21 }, | |
47 | CcB[] = { 28, 36, 44, 52, 60, 68, 76, 84 }, | |
48 | DcA[] = { 0, 0, 0, 1, 4, 10, 20, 35 }, | |
49 | DcB[] = { 56, 84, 120, 164, 216, 276, 344, 420 }, | |
50 | DcC[] = { 448, 512, 576, 640, 704, 768, 832, 896 }; | |
51 | ||
52 | uint64_t A, B, C, D; | |
53 | uint64_t i; | |
70b258fc | 54 | |
5bf703b8 GN |
55 | A = ctx->avx512[0].v[0]; |
56 | B = 8 * ctx->avx512[1].v[0]; | |
57 | C = 64 * ctx->avx512[2].v[0] - CcB[0] * ctx->avx512[1].v[0]; | |
58 | D = 512 * ctx->avx512[3].v[0] - DcC[0] * ctx->avx512[2].v[0] + | |
59 | DcB[0] * ctx->avx512[1].v[0]; | |
60 | ||
61 | for (i = 1; i < 8; i++) { | |
62 | A += ctx->avx512[0].v[i]; | |
63 | B += 8 * ctx->avx512[1].v[i] - i * ctx->avx512[0].v[i]; | |
64 | C += 64 * ctx->avx512[2].v[i] - CcB[i] * ctx->avx512[1].v[i] + | |
65 | CcA[i] * ctx->avx512[0].v[i]; | |
66 | D += 512 * ctx->avx512[3].v[i] - DcC[i] * ctx->avx512[2].v[i] + | |
67 | DcB[i] * ctx->avx512[1].v[i] - DcA[i] * ctx->avx512[0].v[i]; | |
68 | } | |
69 | ||
70 | ZIO_SET_CHECKSUM(zcp, A, B, C, D); | |
71 | } | |
72 | ||
/*
 * Load the four accumulators stored in (ctx)->avx512[0..3] into registers
 * zmm0..zmm3 using unaligned 512-bit moves.
 *
 * The expansion is a bare compound statement, so an invocation is valid
 * with or without a trailing semicolon.
 */
#define	FLETCHER_4_AVX512_RESTORE_CTX(ctx)				\
{									\
	__asm("vmovdqu64 %0, %%zmm0\n\t"				\
	    "vmovdqu64 %1, %%zmm1\n\t"					\
	    "vmovdqu64 %2, %%zmm2\n\t"					\
	    "vmovdqu64 %3, %%zmm3"					\
	    :: "m" ((ctx)->avx512[0]), "m" ((ctx)->avx512[1]),		\
	    "m" ((ctx)->avx512[2]), "m" ((ctx)->avx512[3]));		\
}
80 | ||
/*
 * Store registers zmm0..zmm3 back into the four accumulators at
 * (ctx)->avx512[0..3] using unaligned 512-bit moves.
 *
 * The expansion is a bare compound statement, so an invocation is valid
 * with or without a trailing semicolon.
 */
#define	FLETCHER_4_AVX512_SAVE_CTX(ctx)					\
{									\
	__asm("vmovdqu64 %%zmm0, %0\n\t"				\
	    "vmovdqu64 %%zmm1, %1\n\t"					\
	    "vmovdqu64 %%zmm2, %2\n\t"					\
	    "vmovdqu64 %%zmm3, %3"					\
	    : "=m" ((ctx)->avx512[0]), "=m" ((ctx)->avx512[1]),		\
	    "=m" ((ctx)->avx512[2]), "=m" ((ctx)->avx512[3]));		\
}
88 | ||
89 | static void | |
5bf703b8 | 90 | fletcher_4_avx512f_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) |
70b258fc GN |
91 | { |
92 | const uint32_t *ip = buf; | |
93 | const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size); | |
94 | ||
5bf703b8 GN |
95 | kfpu_begin(); |
96 | ||
97 | FLETCHER_4_AVX512_RESTORE_CTX(ctx); | |
98 | ||
70b258fc GN |
99 | for (; ip < ipend; ip += 8) { |
100 | __asm("vpmovzxdq %0, %%zmm4"::"m" (*ip)); | |
101 | __asm("vpaddq %zmm4, %zmm0, %zmm0"); | |
102 | __asm("vpaddq %zmm0, %zmm1, %zmm1"); | |
103 | __asm("vpaddq %zmm1, %zmm2, %zmm2"); | |
104 | __asm("vpaddq %zmm2, %zmm3, %zmm3"); | |
105 | } | |
5bf703b8 GN |
106 | |
107 | FLETCHER_4_AVX512_SAVE_CTX(ctx); | |
108 | ||
109 | kfpu_end(); | |
70b258fc | 110 | } |
c28a6773 | 111 | STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_native); |
70b258fc GN |
112 | |
113 | static void | |
5bf703b8 GN |
114 | fletcher_4_avx512f_byteswap(fletcher_4_ctx_t *ctx, const void *buf, |
115 | uint64_t size) | |
70b258fc GN |
116 | { |
117 | static const uint64_t byteswap_mask = 0xFFULL; | |
118 | const uint32_t *ip = buf; | |
119 | const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size); | |
120 | ||
5bf703b8 GN |
121 | kfpu_begin(); |
122 | ||
123 | FLETCHER_4_AVX512_RESTORE_CTX(ctx); | |
124 | ||
70b258fc GN |
125 | __asm("vpbroadcastq %0, %%zmm8" :: "r" (byteswap_mask)); |
126 | __asm("vpsllq $8, %zmm8, %zmm9"); | |
127 | __asm("vpsllq $16, %zmm8, %zmm10"); | |
128 | __asm("vpsllq $24, %zmm8, %zmm11"); | |
129 | ||
130 | for (; ip < ipend; ip += 8) { | |
131 | __asm("vpmovzxdq %0, %%zmm5"::"m" (*ip)); | |
132 | ||
133 | __asm("vpsrlq $24, %zmm5, %zmm6"); | |
134 | __asm("vpandd %zmm8, %zmm6, %zmm6"); | |
135 | __asm("vpsrlq $8, %zmm5, %zmm7"); | |
136 | __asm("vpandd %zmm9, %zmm7, %zmm7"); | |
137 | __asm("vpord %zmm6, %zmm7, %zmm4"); | |
138 | __asm("vpsllq $8, %zmm5, %zmm6"); | |
139 | __asm("vpandd %zmm10, %zmm6, %zmm6"); | |
140 | __asm("vpord %zmm6, %zmm4, %zmm4"); | |
141 | __asm("vpsllq $24, %zmm5, %zmm5"); | |
142 | __asm("vpandd %zmm11, %zmm5, %zmm5"); | |
143 | __asm("vpord %zmm5, %zmm4, %zmm4"); | |
144 | ||
145 | __asm("vpaddq %zmm4, %zmm0, %zmm0"); | |
146 | __asm("vpaddq %zmm0, %zmm1, %zmm1"); | |
147 | __asm("vpaddq %zmm1, %zmm2, %zmm2"); | |
148 | __asm("vpaddq %zmm2, %zmm3, %zmm3"); | |
149 | } | |
70b258fc | 150 | |
5bf703b8 | 151 | FLETCHER_4_AVX512_SAVE_CTX(ctx) |
70b258fc GN |
152 | |
153 | kfpu_end(); | |
70b258fc | 154 | } |
c28a6773 | 155 | STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_byteswap); |
70b258fc GN |
156 | |
157 | static boolean_t | |
158 | fletcher_4_avx512f_valid(void) | |
159 | { | |
160 | return (zfs_avx512f_available()); | |
161 | } | |
162 | ||
163 | const fletcher_4_ops_t fletcher_4_avx512f_ops = { | |
fc897b24 GN |
164 | .init_native = fletcher_4_avx512f_init, |
165 | .fini_native = fletcher_4_avx512f_fini, | |
166 | .compute_native = fletcher_4_avx512f_native, | |
167 | .init_byteswap = fletcher_4_avx512f_init, | |
168 | .fini_byteswap = fletcher_4_avx512f_fini, | |
70b258fc GN |
169 | .compute_byteswap = fletcher_4_avx512f_byteswap, |
170 | .valid = fletcher_4_avx512f_valid, | |
171 | .name = "avx512f" | |
172 | }; | |
173 | ||
174 | #endif /* defined(__x86_64) && defined(HAVE_AVX512F) */ |