git.proxmox.com Git - ceph.git/blob - ceph/src/erasure-code/jerasure/gf-complete/src/neon/gf_w64_neon.c
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / erasure-code / jerasure / gf-complete / src / neon / gf_w64_neon.c
1 /*
2 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
3 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
4 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
5 *
6 * Copyright (c) 2014: Janne Grunau <j@jannau.net>
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * - Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 *
15 * - Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in
17 * the documentation and/or other materials provided with the
18 * distribution.
19 *
20 * - Neither the name of the University of Tennessee nor the names of its
21 * contributors may be used to endorse or promote products derived
22 * from this software without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
27 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
28 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
29 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
30 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
31 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
32 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
34 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 *
37 * gf_w64_neon.c
38 *
39 * Neon routines for 64-bit Galois fields
40 *
41 */
42
43 #include "gf_int.h"
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include "gf_w64.h"
47
48
49 #ifndef ARCH_AARCH64
50 #define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)), \
51 vtbl2_u8(tbl, vget_high_u8(v)))
52 #endif
53
54 static
55 inline
56 void
57 neon_w64_split_4_lazy_altmap_multiply_region(gf_t *gf, uint64_t *src,
58 uint64_t *dst, uint64_t *d_end,
59 uint64_t val, int xor)
60 {
61 unsigned i, j, k;
62 uint8_t btable[16];
63 #ifdef ARCH_AARCH64
64 uint8x16_t tables[16][8];
65 #else
66 uint8x8x2_t tables[16][8];
67 #endif
68 uint8x16_t p[8], mask1, si;
69
70 gf_internal_t *h = (gf_internal_t *) gf->scratch;
71 struct gf_split_4_64_lazy_data *ld = (struct gf_split_4_64_lazy_data *) h->private;
72
73 for (i = 0; i < 16; i++) {
74 for (j = 0; j < 8; j++) {
75 for (k = 0; k < 16; k++) {
76 btable[k] = (uint8_t) ld->tables[i][k];
77 ld->tables[i][k] >>= 8;
78 }
79 #ifdef ARCH_AARCH64
80 tables[i][j] = vld1q_u8(btable);
81 #else
82 tables[i][j].val[0] = vld1_u8(btable);
83 tables[i][j].val[1] = vld1_u8(btable + 8);
84 #endif
85 }
86 }
87
88 mask1 = vdupq_n_u8(0xf);
89
90 while (dst < d_end) {
91
92 if (xor) {
93 for (i = 0; i < 8; i++)
94 p[i] = vld1q_u8((uint8_t *) (dst + i * 2));
95 } else {
96 for (i = 0; i < 8; i++)
97 p[i] = vdupq_n_u8(0);
98 }
99
100 i = 0;
101 for (k = 0; k < 8; k++) {
102 uint8x16_t v0 = vld1q_u8((uint8_t *) src);
103 src += 2;
104
105 si = vandq_u8(v0, mask1);
106 for (j = 0; j < 8; j++) {
107 p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
108 }
109 i++;
110 si = vshrq_n_u8(v0, 4);
111 for (j = 0; j < 8; j++) {
112 p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
113 }
114 i++;
115
116 }
117 for (i = 0; i < 8; i++) {
118 vst1q_u8((uint8_t *) dst, p[i]);
119 dst += 2;
120 }
121 }
122 }
123
124 static
125 inline
126 void
127 neon_w64_split_4_lazy_multiply_region(gf_t *gf, uint64_t *src, uint64_t *dst,
128 uint64_t *d_end, uint64_t val, int xor)
129 {
130 unsigned i, j, k;
131 uint8_t btable[16];
132 #ifdef ARCH_AARCH64
133 uint8x16_t tables[16][8];
134 #else
135 uint8x8x2_t tables[16][8];
136 #endif
137 uint8x16_t p[8], mask1, si;
138 uint64x2_t st[8];
139 uint32x4x2_t s32[4];
140 uint16x8x2_t s16[4];
141 uint8x16x2_t s8[4];
142
143 gf_internal_t *h = (gf_internal_t *) gf->scratch;
144 struct gf_split_4_64_lazy_data *ld = (struct gf_split_4_64_lazy_data *) h->private;
145
146 for (i = 0; i < 16; i++) {
147 for (j = 0; j < 8; j++) {
148 for (k = 0; k < 16; k++) {
149 btable[k] = (uint8_t) ld->tables[i][k];
150 ld->tables[i][k] >>= 8;
151 }
152 #ifdef ARCH_AARCH64
153 tables[i][j] = vld1q_u8(btable);
154 #else
155 tables[i][j].val[0] = vld1_u8(btable);
156 tables[i][j].val[1] = vld1_u8(btable + 8);
157 #endif
158 }
159 }
160
161 mask1 = vdupq_n_u8(0xf);
162
163 while (dst < d_end) {
164
165 for (k = 0; k < 8; k++) {
166 st[k] = vld1q_u64(src);
167 src += 2;
168 p[k] = vdupq_n_u8(0);
169 }
170
171 s32[0] = vuzpq_u32(vreinterpretq_u32_u64(st[0]),
172 vreinterpretq_u32_u64(st[1]));
173 s32[1] = vuzpq_u32(vreinterpretq_u32_u64(st[2]),
174 vreinterpretq_u32_u64(st[3]));
175 s32[2] = vuzpq_u32(vreinterpretq_u32_u64(st[4]),
176 vreinterpretq_u32_u64(st[5]));
177 s32[3] = vuzpq_u32(vreinterpretq_u32_u64(st[6]),
178 vreinterpretq_u32_u64(st[7]));
179
180 s16[0] = vuzpq_u16(vreinterpretq_u16_u32(s32[0].val[0]),
181 vreinterpretq_u16_u32(s32[1].val[0]));
182 s16[1] = vuzpq_u16(vreinterpretq_u16_u32(s32[2].val[0]),
183 vreinterpretq_u16_u32(s32[3].val[0]));
184 s16[2] = vuzpq_u16(vreinterpretq_u16_u32(s32[0].val[1]),
185 vreinterpretq_u16_u32(s32[1].val[1]));
186 s16[3] = vuzpq_u16(vreinterpretq_u16_u32(s32[2].val[1]),
187 vreinterpretq_u16_u32(s32[3].val[1]));
188
189 s8[0] = vuzpq_u8(vreinterpretq_u8_u16(s16[0].val[0]),
190 vreinterpretq_u8_u16(s16[1].val[0]));
191 s8[1] = vuzpq_u8(vreinterpretq_u8_u16(s16[0].val[1]),
192 vreinterpretq_u8_u16(s16[1].val[1]));
193 s8[2] = vuzpq_u8(vreinterpretq_u8_u16(s16[2].val[0]),
194 vreinterpretq_u8_u16(s16[3].val[0]));
195 s8[3] = vuzpq_u8(vreinterpretq_u8_u16(s16[2].val[1]),
196 vreinterpretq_u8_u16(s16[3].val[1]));
197
198 i = 0;
199 for (k = 0; k < 8; k++) {
200 si = vandq_u8(s8[k >> 1].val[k & 1], mask1);
201 for (j = 0; j < 8; j++) {
202 p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
203 }
204 i++;
205 si = vshrq_n_u8(s8[k >> 1].val[k & 1], 4);
206 for (j = 0; j < 8; j++) {
207 p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
208 }
209 i++;
210 }
211
212 s8[0] = vzipq_u8(p[0], p[1]);
213 s8[1] = vzipq_u8(p[2], p[3]);
214 s8[2] = vzipq_u8(p[4], p[5]);
215 s8[3] = vzipq_u8(p[6], p[7]);
216
217 s16[0] = vzipq_u16(vreinterpretq_u16_u8(s8[0].val[0]),
218 vreinterpretq_u16_u8(s8[1].val[0]));
219 s16[1] = vzipq_u16(vreinterpretq_u16_u8(s8[2].val[0]),
220 vreinterpretq_u16_u8(s8[3].val[0]));
221 s16[2] = vzipq_u16(vreinterpretq_u16_u8(s8[0].val[1]),
222 vreinterpretq_u16_u8(s8[1].val[1]));
223 s16[3] = vzipq_u16(vreinterpretq_u16_u8(s8[2].val[1]),
224 vreinterpretq_u16_u8(s8[3].val[1]));
225
226 s32[0] = vzipq_u32(vreinterpretq_u32_u16(s16[0].val[0]),
227 vreinterpretq_u32_u16(s16[1].val[0]));
228 s32[1] = vzipq_u32(vreinterpretq_u32_u16(s16[0].val[1]),
229 vreinterpretq_u32_u16(s16[1].val[1]));
230 s32[2] = vzipq_u32(vreinterpretq_u32_u16(s16[2].val[0]),
231 vreinterpretq_u32_u16(s16[3].val[0]));
232 s32[3] = vzipq_u32(vreinterpretq_u32_u16(s16[2].val[1]),
233 vreinterpretq_u32_u16(s16[3].val[1]));
234
235 for (k = 0; k < 8; k ++) {
236 st[k] = vreinterpretq_u64_u32(s32[k >> 1].val[k & 1]);
237 }
238
239 if (xor) {
240 for (i = 0; i < 8; i++) {
241 uint64x2_t t1 = vld1q_u64(dst);
242 vst1q_u64(dst, veorq_u64(st[i], t1));
243 dst += 2;
244 }
245 } else {
246 for (i = 0; i < 8; i++) {
247 vst1q_u64(dst, st[i]);
248 dst += 2;
249 }
250 }
251
252 }
253 }
254
255 static
256 void
257 gf_w64_neon_split_4_lazy_multiply_region(gf_t *gf, void *src, void *dest,
258 uint64_t val, int bytes, int xor,
259 int altmap)
260 {
261 gf_internal_t *h;
262 int i, j, k;
263 uint64_t pp, v, *s64, *d64, *top;
264 struct gf_split_4_64_lazy_data *ld;
265 gf_region_data rd;
266
267 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
268 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
269
270 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128);
271 gf_do_initial_region_alignment(&rd);
272
273 s64 = (uint64_t *) rd.s_start;
274 d64 = (uint64_t *) rd.d_start;
275 top = (uint64_t *) rd.d_top;
276
277 h = (gf_internal_t *) gf->scratch;
278 pp = h->prim_poly;
279 ld = (struct gf_split_4_64_lazy_data *) h->private;
280
281 v = val;
282 for (i = 0; i < 16; i++) {
283 ld->tables[i][0] = 0;
284 for (j = 1; j < 16; j <<= 1) {
285 for (k = 0; k < j; k++) {
286 ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
287 }
288 v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
289 }
290 }
291
292 if (altmap) {
293 if (xor)
294 neon_w64_split_4_lazy_altmap_multiply_region(gf, s64, d64, top, val, 1);
295 else
296 neon_w64_split_4_lazy_altmap_multiply_region(gf, s64, d64, top, val, 0);
297 } else {
298 if (xor)
299 neon_w64_split_4_lazy_multiply_region(gf, s64, d64, top, val, 1);
300 else
301 neon_w64_split_4_lazy_multiply_region(gf, s64, d64, top, val, 0);
302 }
303
304 gf_do_final_region_alignment(&rd);
305 }
306
307 static
308 void
309 gf_w64_split_4_64_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest,
310 uint64_t val, int bytes, int xor)
311 {
312 gf_w64_neon_split_4_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0);
313 }
314
315 static
316 void
317 gf_w64_split_4_64_lazy_altmap_multiply_region_neon(gf_t *gf, void *src,
318 void *dest, uint64_t val,
319 int bytes, int xor)
320 {
321 gf_w64_neon_split_4_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1);
322 }
323
324 void gf_w64_neon_split_init(gf_t *gf)
325 {
326 gf_internal_t *h = (gf_internal_t *) gf->scratch;
327
328 if (h->region_type & GF_REGION_ALTMAP)
329 SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_altmap_multiply_region_neon)
330 else
331 SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region_neon)
332
333 }