/*
 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
 *
 * Copyright (c) 2014: Janne Grunau <j@jannau.net>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 *  - Neither the name of the University of Tennessee nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * gf_w64_neon.c
 *
 * Neon routines for 64-bit Galois fields
 *
 */

#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
#include "gf_w64.h"
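/*
 * ARMv7 NEON has no single 16-byte table-lookup intrinsic, so the AArch64
 * vqtbl1q_u8() is emulated below with two 8-byte vtbl2_u8() lookups over
 * the low and high halves of the index vector. ARCH_AARCH64 is the macro
 * GF-Complete's build defines for 64-bit ARM targets.
 */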
#ifndef ARCH_AARCH64
#define vqtbl1q_u8(tbl, v) vcombine_u8(vtbl2_u8(tbl, vget_low_u8(v)),   \
                                       vtbl2_u8(tbl, vget_high_u8(v)))
#endif
static
inline
void
neon_w64_split_4_lazy_altmap_multiply_region(gf_t *gf, uint64_t *src,
                                             uint64_t *dst, uint64_t *d_end,
                                             uint64_t val, int xor)
{
  unsigned i, j, k;
  uint8_t btable[16];
#ifdef ARCH_AARCH64
  uint8x16_t tables[16][8];
#else
  uint8x8x2_t tables[16][8];
#endif
  uint8x16_t p[8], mask1, si;

  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  struct gf_split_4_64_lazy_data *ld = (struct gf_split_4_64_lazy_data *) h->private;
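  /*
   * Slice the 16x16 table of 64-bit products (built lazily by the caller)
   * into 128 byte-wide NEON lookup tables: tables[i][j] maps the i-th
   * 4-bit chunk of a source word to byte j of that chunk's product with
   * val. Note the loop consumes ld->tables, shifting it right 8 bits per
   * pass over j.
   */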
  for (i = 0; i < 16; i++) {
    for (j = 0; j < 8; j++) {
      for (k = 0; k < 16; k++) {
        btable[k] = (uint8_t) ld->tables[i][k];
        ld->tables[i][k] >>= 8;
      }
#ifdef ARCH_AARCH64
      tables[i][j] = vld1q_u8(btable);
#else
      tables[i][j].val[0] = vld1_u8(btable);
      tables[i][j].val[1] = vld1_u8(btable + 8);
#endif
    }
  }

  mask1 = vdupq_n_u8(0xf);
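  /*
   * Main loop over 128-byte chunks. In the altmap layout the data is
   * already byte-sliced: vector k of a chunk holds byte k of each of the
   * chunk's 16 source words, so the low and high nibbles of each byte
   * index the lookup tables directly and the products accumulate into
   * p[0..7].
   */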
  while (dst < d_end) {

    if (xor) {
      for (i = 0; i < 8; i++)
        p[i] = vld1q_u8((uint8_t *) (dst + i * 2));
    } else {
      for (i = 0; i < 8; i++)
        p[i] = vdupq_n_u8(0);
    }

    i = 0;
    for (k = 0; k < 8; k++) {
      uint8x16_t v0 = vld1q_u8((uint8_t *) src);
      src += 2;

      si = vandq_u8(v0, mask1);
      for (j = 0; j < 8; j++) {
        p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
      }
      i++;

      si = vshrq_n_u8(v0, 4);
      for (j = 0; j < 8; j++) {
        p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
      }
      i++;
    }

    for (i = 0; i < 8; i++) {
      vst1q_u8((uint8_t *) dst, p[i]);
      dst += 2;
    }
  }
}
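/*
 * Standard-layout variant: each 128-byte chunk is loaded as 16 ordinary
 * 64-bit words, transposed into byte planes with vuzpq so the same
 * byte-sliced table lookups apply, then transposed back with vzipq before
 * the products are written out.
 */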
static
inline
void
neon_w64_split_4_lazy_multiply_region(gf_t *gf, uint64_t *src, uint64_t *dst,
                                      uint64_t *d_end, uint64_t val, int xor)
{
  unsigned i, j, k;
  uint8_t btable[16];
#ifdef ARCH_AARCH64
  uint8x16_t tables[16][8];
#else
  uint8x8x2_t tables[16][8];
#endif
  uint8x16_t p[8], mask1, si;
  uint64x2_t st[8];
  uint32x4x2_t s32[4];
  uint16x8x2_t s16[4];
  uint8x16x2_t s8[4];

  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  struct gf_split_4_64_lazy_data *ld = (struct gf_split_4_64_lazy_data *) h->private;
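  /* Build the same 128 byte-wide lookup tables as the altmap variant. */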
  for (i = 0; i < 16; i++) {
    for (j = 0; j < 8; j++) {
      for (k = 0; k < 16; k++) {
        btable[k] = (uint8_t) ld->tables[i][k];
        ld->tables[i][k] >>= 8;
      }
#ifdef ARCH_AARCH64
      tables[i][j] = vld1q_u8(btable);
#else
      tables[i][j].val[0] = vld1_u8(btable);
      tables[i][j].val[1] = vld1_u8(btable + 8);
#endif
    }
  }

  mask1 = vdupq_n_u8(0xf);
  while (dst < d_end) {

    for (k = 0; k < 8; k++) {
      st[k] = vld1q_u64(src);
      src += 2;
      p[k] = vdupq_n_u8(0);
    }
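    /*
     * Three unzip rounds (64 -> 32 -> 16 -> 8 bit granularity) transpose
     * the 16 loaded words into byte planes: afterwards,
     * s8[k >> 1].val[k & 1] holds byte k of every word.
     */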
    s32[0] = vuzpq_u32(vreinterpretq_u32_u64(st[0]),
                       vreinterpretq_u32_u64(st[1]));
    s32[1] = vuzpq_u32(vreinterpretq_u32_u64(st[2]),
                       vreinterpretq_u32_u64(st[3]));
    s32[2] = vuzpq_u32(vreinterpretq_u32_u64(st[4]),
                       vreinterpretq_u32_u64(st[5]));
    s32[3] = vuzpq_u32(vreinterpretq_u32_u64(st[6]),
                       vreinterpretq_u32_u64(st[7]));

    s16[0] = vuzpq_u16(vreinterpretq_u16_u32(s32[0].val[0]),
                       vreinterpretq_u16_u32(s32[1].val[0]));
    s16[1] = vuzpq_u16(vreinterpretq_u16_u32(s32[2].val[0]),
                       vreinterpretq_u16_u32(s32[3].val[0]));
    s16[2] = vuzpq_u16(vreinterpretq_u16_u32(s32[0].val[1]),
                       vreinterpretq_u16_u32(s32[1].val[1]));
    s16[3] = vuzpq_u16(vreinterpretq_u16_u32(s32[2].val[1]),
                       vreinterpretq_u16_u32(s32[3].val[1]));

    s8[0] = vuzpq_u8(vreinterpretq_u8_u16(s16[0].val[0]),
                     vreinterpretq_u8_u16(s16[1].val[0]));
    s8[1] = vuzpq_u8(vreinterpretq_u8_u16(s16[0].val[1]),
                     vreinterpretq_u8_u16(s16[1].val[1]));
    s8[2] = vuzpq_u8(vreinterpretq_u8_u16(s16[2].val[0]),
                     vreinterpretq_u8_u16(s16[3].val[0]));
    s8[3] = vuzpq_u8(vreinterpretq_u8_u16(s16[2].val[1]),
                     vreinterpretq_u8_u16(s16[3].val[1]));
    /* Table lookups: low nibble, then high nibble, of each byte plane. */
    i = 0;
    for (k = 0; k < 8; k++) {
      si = vandq_u8(s8[k >> 1].val[k & 1], mask1);
      for (j = 0; j < 8; j++) {
        p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
      }
      i++;

      si = vshrq_n_u8(s8[k >> 1].val[k & 1], 4);
      for (j = 0; j < 8; j++) {
        p[j] = veorq_u8(p[j], vqtbl1q_u8(tables[i][j], si));
      }
      i++;
    }
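    /*
     * Reverse transpose: three zip rounds (8 -> 16 -> 32 bit granularity)
     * reassemble the product byte planes p[0..7] into 16 contiguous
     * 64-bit product words in st[0..7].
     */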
    s8[0] = vzipq_u8(p[0], p[1]);
    s8[1] = vzipq_u8(p[2], p[3]);
    s8[2] = vzipq_u8(p[4], p[5]);
    s8[3] = vzipq_u8(p[6], p[7]);

    s16[0] = vzipq_u16(vreinterpretq_u16_u8(s8[0].val[0]),
                       vreinterpretq_u16_u8(s8[1].val[0]));
    s16[1] = vzipq_u16(vreinterpretq_u16_u8(s8[2].val[0]),
                       vreinterpretq_u16_u8(s8[3].val[0]));
    s16[2] = vzipq_u16(vreinterpretq_u16_u8(s8[0].val[1]),
                       vreinterpretq_u16_u8(s8[1].val[1]));
    s16[3] = vzipq_u16(vreinterpretq_u16_u8(s8[2].val[1]),
                       vreinterpretq_u16_u8(s8[3].val[1]));

    s32[0] = vzipq_u32(vreinterpretq_u32_u16(s16[0].val[0]),
                       vreinterpretq_u32_u16(s16[1].val[0]));
    s32[1] = vzipq_u32(vreinterpretq_u32_u16(s16[0].val[1]),
                       vreinterpretq_u32_u16(s16[1].val[1]));
    s32[2] = vzipq_u32(vreinterpretq_u32_u16(s16[2].val[0]),
                       vreinterpretq_u32_u16(s16[3].val[0]));
    s32[3] = vzipq_u32(vreinterpretq_u32_u16(s16[2].val[1]),
                       vreinterpretq_u32_u16(s16[3].val[1]));
    for (k = 0; k < 8; k++) {
      st[k] = vreinterpretq_u64_u32(s32[k >> 1].val[k & 1]);
    }
    if (xor) {
      for (i = 0; i < 8; i++) {
        uint64x2_t t1 = vld1q_u64(dst);
        vst1q_u64(dst, veorq_u64(st[i], t1));
        dst += 2;
      }
    } else {
      for (i = 0; i < 8; i++) {
        vst1q_u64(dst, st[i]);
        dst += 2;
      }
    }
  }
}
static
void
gf_w64_neon_split_4_lazy_multiply_region(gf_t *gf, void *src, void *dest,
                                         uint64_t val, int bytes, int xor,
                                         int altmap)
{
  gf_internal_t *h;
  int i, j, k;
  uint64_t pp, v, *s64, *d64, *top;
  struct gf_split_4_64_lazy_data *ld;
  gf_region_data rd;
  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 128);
  gf_do_initial_region_alignment(&rd);

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  top = (uint64_t *) rd.d_top;

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;
  ld = (struct gf_split_4_64_lazy_data *) h->private;
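  /*
   * Lazily (re)build the 16x16 product table for this val:
   * ld->tables[i][k] = (k << (4*i)) * val in GF(2^64). Each power-of-two
   * index gets the next doubling of v (shift left, reducing by the
   * primitive polynomial pp when the top bit GF_FIRST_BIT is set), and
   * the remaining entries follow by XOR.
   */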
  v = val;
  for (i = 0; i < 16; i++) {
    ld->tables[i][0] = 0;
    for (j = 1; j < 16; j <<= 1) {
      for (k = 0; k < j; k++) {
        ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
      }
      v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
    }
  }
  if (altmap) {
    if (xor)
      neon_w64_split_4_lazy_altmap_multiply_region(gf, s64, d64, top, val, 1);
    else
      neon_w64_split_4_lazy_altmap_multiply_region(gf, s64, d64, top, val, 0);
  } else {
    if (xor)
      neon_w64_split_4_lazy_multiply_region(gf, s64, d64, top, val, 1);
    else
      neon_w64_split_4_lazy_multiply_region(gf, s64, d64, top, val, 0);
  }

  gf_do_final_region_alignment(&rd);
}
static
void
gf_w64_split_4_64_lazy_multiply_region_neon(gf_t *gf, void *src, void *dest,
                                            uint64_t val, int bytes, int xor)
{
  gf_w64_neon_split_4_lazy_multiply_region(gf, src, dest, val, bytes, xor, 0);
}
static
void
gf_w64_split_4_64_lazy_altmap_multiply_region_neon(gf_t *gf, void *src,
                                                   void *dest, uint64_t val,
                                                   int bytes, int xor)
{
  gf_w64_neon_split_4_lazy_multiply_region(gf, src, dest, val, bytes, xor, 1);
}
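/*
 * Init hook: installs the NEON region multiplier on the gf_t. This is
 * reached from the 64-bit SPLIT(64,4) initialization in gf_w64.c when NEON
 * is available; which wrapper is chosen depends on whether the field was
 * created with GF_REGION_ALTMAP.
 */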
void gf_w64_neon_split_init(gf_t *gf)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;

  if (h->region_type & GF_REGION_ALTMAP)
    SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_altmap_multiply_region_neon)
  else
    SET_FUNCTION(gf,multiply_region,w64,gf_w64_split_4_64_lazy_multiply_region_neon)
}