1 /*
2 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
3 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
4 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
5 *
6 * gf_w8.c
7 *
8 * Routines for 8-bit Galois fields
9 */
10
11 #include "gf_int.h"
12 #include "gf_w8.h"
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <assert.h>
16 #include "gf_cpu.h"
17
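/* AB2 and SSE_AB2 multiply every packed w-bit element of a word by two
   (i.e., by x) in a single step: each element is shifted left, the elements
   whose high bit was set are turned into an all-ones mask by the
   shift/subtract trick, and that mask selects where the replicated low bits
   of the primitive polynomial are xored back in.  ip/pp hold the replicated
   polynomial, am2/m2 select each element's high bit, and am1/m1 keep only
   the bits that legitimately remain after the left shift. */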
18 #define AB2(ip, am1 ,am2, b, t1, t2) {\
19 t1 = (b << 1) & am1;\
20 t2 = b & am2; \
21 t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \
22 b = (t1 ^ (t2 & ip));}
23
24 #define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\
25 t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
26 t2 = _mm_and_si128(va, m2); \
27 t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \
28 va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); }
29
30 #define MM_PRINT(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 2) printf(" %02x %02x", blah[15-ii], blah[14-ii]); printf("\n"); }
31
32 static
33 inline
34 uint32_t gf_w8_inverse_from_divide (gf_t *gf, uint32_t a)
35 {
36 return gf->divide.w32(gf, 1, a);
37 }
38
39 static
40 inline
41 uint32_t gf_w8_divide_from_inverse (gf_t *gf, uint32_t a, uint32_t b)
42 {
43 b = gf->inverse.w32(gf, b);
44 return gf->multiply.w32(gf, a, b);
45 }
46
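/* Computes b^(-1) with the extended Euclidean algorithm run on the polynomial
   representations of b and the primitive polynomial over GF(2): e_i holds the
   current remainder, d_i its degree, c_i the quotient of each division step,
   and y_i the accumulated Bezout coefficient, which is the inverse once the
   remainder reaches 1. */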
47 static
48 inline
49 uint32_t gf_w8_euclid (gf_t *gf, uint32_t b)
50 {
51 uint32_t e_i, e_im1, e_ip1;
52 uint32_t d_i, d_im1, d_ip1;
53 uint32_t y_i, y_im1, y_ip1;
54 uint32_t c_i;
55
56 if (b == 0) return -1;
57 e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
58 e_i = b;
59 d_im1 = 8;
60 for (d_i = d_im1; ((1 << d_i) & e_i) == 0; d_i--) ;
61 y_i = 1;
62 y_im1 = 0;
63
64 while (e_i != 1) {
65
66 e_ip1 = e_im1;
67 d_ip1 = d_im1;
68 c_i = 0;
69
70 while (d_ip1 >= d_i) {
71 c_i ^= (1 << (d_ip1 - d_i));
72 e_ip1 ^= (e_i << (d_ip1 - d_i));
73 if (e_ip1 == 0) return 0;
74 while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;
75 }
76
77 y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i);
78 y_im1 = y_i;
79 y_i = y_ip1;
80
81 e_im1 = e_i;
82 d_im1 = d_i;
83 e_i = e_ip1;
84 d_i = d_ip1;
85 }
86
87 return y_i;
88 }
89
90 static
91 gf_val_32_t gf_w8_extract_word(gf_t *gf, void *start, int bytes, int index)
92 {
93 uint8_t *r8;
94
95 r8 = (uint8_t *) start;
96 return r8[index];
97 }
98
99 static
100 gf_val_32_t gf_w8_composite_extract_word(gf_t *gf, void *start, int bytes, int index)
101 {
102 int sub_size;
103 gf_internal_t *h;
104 uint8_t *r8, *top;
105 uint8_t a, b;
106 gf_region_data rd;
107
108 h = (gf_internal_t *) gf->scratch;
109 gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32);
110 r8 = (uint8_t *) start;
111 if (r8 + index < (uint8_t *) rd.d_start) return r8[index];
112 if (r8 + index >= (uint8_t *) rd.d_top) return r8[index];
113 index -= (((uint8_t *) rd.d_start) - r8);
114 r8 = (uint8_t *) rd.d_start;
115 top = (uint8_t *) rd.d_top;
116 sub_size = (top-r8)/2;
117
118 a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index);
119 b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index);
120 return (a | (b << 4));
121 }
122
123 static
124 inline
125 uint32_t gf_w8_matrix (gf_t *gf, uint32_t b)
126 {
127 return gf_bitmatrix_inverse(b, 8, ((gf_internal_t *) (gf->scratch))->prim_poly);
128 }
129
130
131 #if defined(INTEL_SSE4_PCLMUL)
132 static
133 inline
134 gf_val_32_t
135 gf_w8_clm_multiply_2 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
136 {
137 gf_val_32_t rv = 0;
138
139 __m128i a, b;
140 __m128i result;
141 __m128i prim_poly;
142 __m128i w;
143 gf_internal_t * h = gf->scratch;
144
145 a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0);
146 b = _mm_insert_epi32 (a, b8, 0);
147
148 prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));
149
150 /* Do the initial multiply */
151
152 result = _mm_clmulepi64_si128 (a, b, 0);
153
154 /* Ben: Do the prim_poly reduction twice. We are guaranteed that at most
155 two reductions are needed here, because (w-2)/z == 2, where z is the
156 number of zeros after the leading 1 in the primitive polynomial.
157
158 _mm_clmulepi64_si128 is the carryless multiply operation. Here
159 _mm_srli_si128 shifts the product right by one byte, which lets us
160 multiply prim_poly by the bits of the product that lie above bit 7;
161 xoring that back into the product is one reduction step. */
162
163 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
164 result = _mm_xor_si128 (result, w);
165 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
166 result = _mm_xor_si128 (result, w);
167
168 /* Extracts 32 bit value from result. */
169
170 rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
171
172 return rv;
173 }
174 #endif
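/* Illustrative sketch only (kept inside a comment so it does not affect the
   build): a hypothetical scalar routine, clm_mult_scalar, showing what the
   PCLMUL path above computes -- a carryless multiply followed by reduction
   passes that fold the bits above bit 7 back down.  It assumes a 9-bit
   prim_poly with bit 8 set, as used for w = 8; the SIMD code unrolls a fixed
   number of reduction passes instead of looping (see gf_w8_cfm_init).

   static uint32_t clm_mult_scalar(uint32_t a, uint32_t b, uint32_t prim_poly)
   {
     uint32_t p = 0, w, hi, i;

     for (i = 0; i < 8; i++)            // carryless (polynomial) multiply
       if (a & (1 << i)) p ^= (b << i);

     while (p >> 8) {                   // fold the overflow bits back down
       hi = p >> 8;
       w = 0;
       for (i = 0; i < 9; i++)          // carryless multiply: hi * prim_poly
         if (prim_poly & (1 << i)) w ^= (hi << i);
       p ^= w;                          // p stays congruent mod prim_poly; degree strictly drops
     }
     return p;                          // a * b mod prim_poly
   }
*/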
175
176 #if defined(INTEL_SSE4_PCLMUL)
177 static
178 inline
179 gf_val_32_t
180 gf_w8_clm_multiply_3 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
181 {
182 gf_val_32_t rv = 0;
183
184 __m128i a, b;
185 __m128i result;
186 __m128i prim_poly;
187 __m128i w;
188 gf_internal_t * h = gf->scratch;
189
190 a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0);
191 b = _mm_insert_epi32 (a, b8, 0);
192
193 prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));
194
195 /* Do the initial multiply */
196
197 result = _mm_clmulepi64_si128 (a, b, 0);
198
199 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
200 result = _mm_xor_si128 (result, w);
201 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
202 result = _mm_xor_si128 (result, w);
203 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
204 result = _mm_xor_si128 (result, w);
205
206 /* Extracts 32 bit value from result. */
207
208 rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
209
210 return rv;
211 }
212 #endif
213
214 #if defined(INTEL_SSE4_PCLMUL)
215 static
216 inline
217 gf_val_32_t
218 gf_w8_clm_multiply_4 (gf_t *gf, gf_val_32_t a8, gf_val_32_t b8)
219 {
220 gf_val_32_t rv = 0;
221
222 __m128i a, b;
223 __m128i result;
224 __m128i prim_poly;
225 __m128i w;
226 gf_internal_t * h = gf->scratch;
227
228 a = _mm_insert_epi32 (_mm_setzero_si128(), a8, 0);
229 b = _mm_insert_epi32 (a, b8, 0);
230
231 prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));
232
233 /* Do the initial multiply */
234
235 result = _mm_clmulepi64_si128 (a, b, 0);
236
237 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
238 result = _mm_xor_si128 (result, w);
239 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
240 result = _mm_xor_si128 (result, w);
241 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
242 result = _mm_xor_si128 (result, w);
243 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
244 result = _mm_xor_si128 (result, w);
245
246 /* Extracts 32 bit value from result. */
247 rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
248
249 return rv;
250 }
251 #endif
252
253
254 static
255 void
256 gf_w8_multiply_region_from_single(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int
257 xor)
258 {
259 gf_region_data rd;
260 uint8_t *s8;
261 uint8_t *d8;
262
263 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
264 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
265
266 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
267 gf_do_initial_region_alignment(&rd);
268
269 s8 = (uint8_t *) rd.s_start;
270 d8 = (uint8_t *) rd.d_start;
271
272 if (xor) {
273 while (d8 < ((uint8_t *) rd.d_top)) {
274 *d8 ^= gf->multiply.w32(gf, val, *s8);
275 d8++;
276 s8++;
277 }
278 } else {
279 while (d8 < ((uint8_t *) rd.d_top)) {
280 *d8 = gf->multiply.w32(gf, val, *s8);
281 d8++;
282 s8++;
283 }
284 }
285 gf_do_final_region_alignment(&rd);
286 }
287
288 #if defined(INTEL_SSE4_PCLMUL)
289 static
290 void
291 gf_w8_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int
292 xor)
293 {
294 gf_region_data rd;
295 uint8_t *s8;
296 uint8_t *d8;
297
298 __m128i a, b;
299 __m128i result;
300 __m128i prim_poly;
301 __m128i w;
302 gf_internal_t * h = gf->scratch;
303
304 prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));
305
306 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
307 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
308
309 a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
310
311 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
312 gf_do_initial_region_alignment(&rd);
313
314 s8 = (uint8_t *) rd.s_start;
315 d8 = (uint8_t *) rd.d_start;
316
317 if (xor) {
318 while (d8 < ((uint8_t *) rd.d_top)) {
319 b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
320 result = _mm_clmulepi64_si128 (a, b, 0);
321 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
322 result = _mm_xor_si128 (result, w);
323 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
324 result = _mm_xor_si128 (result, w);
325 *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
326 d8++;
327 s8++;
328 }
329 } else {
330 while (d8 < ((uint8_t *) rd.d_top)) {
331 b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
332 result = _mm_clmulepi64_si128 (a, b, 0);
333 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
334 result = _mm_xor_si128 (result, w);
335 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
336 result = _mm_xor_si128 (result, w);
337 *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0));
338 d8++;
339 s8++;
340 }
341 }
342 gf_do_final_region_alignment(&rd);
343 }
344 #endif
345
346 #if defined(INTEL_SSE4_PCLMUL)
347 static
348 void
349 gf_w8_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int
350 xor)
351 {
352 gf_region_data rd;
353 uint8_t *s8;
354 uint8_t *d8;
355
356 __m128i a, b;
357 __m128i result;
358 __m128i prim_poly;
359 __m128i w;
360 gf_internal_t * h = gf->scratch;
361
362 prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));
363
364 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
365 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
366
367 a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
368
369 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
370 gf_do_initial_region_alignment(&rd);
371
372 s8 = (uint8_t *) rd.s_start;
373 d8 = (uint8_t *) rd.d_start;
374
375 if (xor) {
376 while (d8 < ((uint8_t *) rd.d_top)) {
377 b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
378 result = _mm_clmulepi64_si128 (a, b, 0);
379 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
380 result = _mm_xor_si128 (result, w);
381 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
382 result = _mm_xor_si128 (result, w);
383 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
384 result = _mm_xor_si128 (result, w);
385 *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
386 d8++;
387 s8++;
388 }
389 } else {
390 while (d8 < ((uint8_t *) rd.d_top)) {
391 b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
392 result = _mm_clmulepi64_si128 (a, b, 0);
393 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
394 result = _mm_xor_si128 (result, w);
395 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
396 result = _mm_xor_si128 (result, w);
397 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
398 result = _mm_xor_si128 (result, w);
399 *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0));
400 d8++;
401 s8++;
402 }
403 }
404 gf_do_final_region_alignment(&rd);
405 }
406 #endif
407
408 #if defined(INTEL_SSE4_PCLMUL)
409 static
410 void
411 gf_w8_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int
412 xor)
413 {
414 gf_region_data rd;
415 uint8_t *s8;
416 uint8_t *d8;
417
418 __m128i a, b;
419 __m128i result;
420 __m128i prim_poly;
421 __m128i w;
422 gf_internal_t * h = gf->scratch;
423
424 prim_poly = _mm_set_epi32(0, 0, 0, (uint32_t)(h->prim_poly & 0x1ffULL));
425
426 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
427 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
428
429 a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
430
431 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
432 gf_do_initial_region_alignment(&rd);
433
434 s8 = (uint8_t *) rd.s_start;
435 d8 = (uint8_t *) rd.d_start;
436
437 if (xor) {
438 while (d8 < ((uint8_t *) rd.d_top)) {
439 b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
440 result = _mm_clmulepi64_si128 (a, b, 0);
441 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
442 result = _mm_xor_si128 (result, w);
443 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
444 result = _mm_xor_si128 (result, w);
445 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
446 result = _mm_xor_si128 (result, w);
447 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
448 result = _mm_xor_si128 (result, w);
449 *d8 ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
450 d8++;
451 s8++;
452 }
453 } else {
454 while (d8 < ((uint8_t *) rd.d_top)) {
455 b = _mm_insert_epi32 (a, (gf_val_32_t)(*s8), 0);
456 result = _mm_clmulepi64_si128 (a, b, 0);
457 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
458 result = _mm_xor_si128 (result, w);
459 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
460 result = _mm_xor_si128 (result, w);
461 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
462 result = _mm_xor_si128 (result, w);
463 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 1), 0);
464 result = _mm_xor_si128 (result, w);
465 *d8 = ((gf_val_32_t)_mm_extract_epi32(result, 0));
466 d8++;
467 s8++;
468 }
469 }
470 gf_do_final_region_alignment(&rd);
471 }
472 #endif
473
474 /* ------------------------------------------------------------
475 IMPLEMENTATION: SHIFT:
476
477 JSP: The world's dumbest multiplication algorithm. I only
478 include it for completeness. It does have the feature that it requires no
479 extra memory.
480 */
481
482 static
483 inline
484 uint32_t
485 gf_w8_shift_multiply (gf_t *gf, uint32_t a8, uint32_t b8)
486 {
487 uint16_t product, i, pp, a, b;
488 gf_internal_t *h;
489
490 a = a8;
491 b = b8;
492 h = (gf_internal_t *) gf->scratch;
493 pp = h->prim_poly;
494
495 product = 0;
496
497 for (i = 0; i < GF_FIELD_WIDTH; i++) {
498 if (a & (1 << i)) product ^= (b << i);
499 }
500 for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) {
501 if (product & (1 << i)) product ^= (pp << (i-GF_FIELD_WIDTH));
502 }
503 return product;
504 }
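/* Worked example (taking prim_poly = 0x11D, i.e. x^8+x^4+x^3+x^2+1): with
   a8 = 0x02 and b8 = 0x87, the first loop forms the carryless product
   0x87 << 1 = 0x10E; the second loop sees bit 8 set and xors in pp << 0 =
   0x11D, leaving 0x10E ^ 0x11D = 0x13.  So 0x02 * 0x87 = 0x13 under that
   polynomial. */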
505
506 static
507 int gf_w8_cfm_init(gf_t *gf)
508 {
509 #if defined(INTEL_SSE4_PCLMUL)
510 if (gf_cpu_supports_intel_pclmul) {
511 gf_internal_t *h;
512
513 h = (gf_internal_t *) gf->scratch;
514
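  /* The number of reduction passes needed after a carryless multiply depends
     on how many zero bits follow the leading x^8 term of the primitive
     polynomial: with bits 7..5 all zero two passes suffice (multiply_2), with
     bits 7..6 zero three passes are needed, and with only bit 7 zero, four.
     Polynomials with bit 7 set are not handled by this carry-free path. */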
515 if ((0xe0 & h->prim_poly) == 0){
516 SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_2)
517 SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_2)
518 }else if ((0xc0 & h->prim_poly) == 0){
519 SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_3)
520 SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_3)
521 }else if ((0x80 & h->prim_poly) == 0){
522 SET_FUNCTION(gf,multiply,w32,gf_w8_clm_multiply_4)
523 SET_FUNCTION(gf,multiply_region,w32,gf_w8_clm_multiply_region_from_single_4)
524 }else{
525 return 0;
526 }
527 return 1;
528 }
529 #elif defined(ARM_NEON)
530 if (gf_cpu_supports_arm_neon) {
531 return gf_w8_neon_cfm_init(gf);
532 }
533 #endif
534
535 return 0;
536
537 }
538
539 static
540 int gf_w8_shift_init(gf_t *gf)
541 {
542 SET_FUNCTION(gf,multiply,w32,gf_w8_shift_multiply) /* The others will be set automatically */
543 return 1;
544 }
545
546 /* ------------------------------------------------------------
547 IMPLEMENTATION: LOG_TABLE:
548
549 JSP: Kevin wrote this, and I'm converting it to my structure.
550 */
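/* a * b is computed as antilog[log(a) + log(b)].  The antilog table is stored
   twice in a row (see gf_w8_log_init), so a sum of two logs -- at most 2*254 --
   indexes it directly with no modulo; the divide paths bias the index by
   GF_MULT_GROUP_SIZE (or point div_tbl 255 entries into the antilog table) so
   a negative difference of logs also lands on a valid entry.  The LOG_ZERO
   variants give log(0) a large sentinel value whose antilog entries are
   zero-filled, so most zero checks can be skipped. */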
551
552 static
553 inline
554 uint32_t
555 gf_w8_logzero_multiply (gf_t *gf, uint32_t a, uint32_t b)
556 {
557 struct gf_w8_logzero_table_data *ltd;
558
559 ltd = (struct gf_w8_logzero_table_data *) ((gf_internal_t *) gf->scratch)->private;
560 return ltd->antilog_tbl[ltd->log_tbl[a] + ltd->log_tbl[b]];
561 }
562
563 static
564 inline
565 uint32_t
566 gf_w8_logzero_divide (gf_t *gf, uint32_t a, uint32_t b)
567 {
568 struct gf_w8_logzero_table_data *ltd;
569
570 ltd = (struct gf_w8_logzero_table_data *) ((gf_internal_t *) gf->scratch)->private;
571 return ltd->div_tbl[ltd->log_tbl[a] - ltd->log_tbl[b]];
572 }
573
574 static
575 inline
576 uint32_t
577 gf_w8_logzero_small_multiply (gf_t *gf, uint32_t a, uint32_t b)
578 {
579 struct gf_w8_logzero_small_table_data *std;
580
581 std = (struct gf_w8_logzero_small_table_data *) ((gf_internal_t *) gf->scratch)->private;
582 if (b == 0) return 0;
583 return std->antilog_tbl[std->log_tbl[a] + std->log_tbl[b]];
584 }
585
586 static
587 inline
588 uint32_t
589 gf_w8_logzero_small_divide (gf_t *gf, uint32_t a, uint32_t b)
590 {
591 struct gf_w8_logzero_small_table_data *std;
592
593 std = (struct gf_w8_logzero_small_table_data *) ((gf_internal_t *) gf->scratch)->private;
594 return std->div_tbl[std->log_tbl[a] - std->log_tbl[b]];
595 }
596
597 static
598 inline
599 uint32_t
600 gf_w8_log_multiply (gf_t *gf, uint32_t a, uint32_t b)
601 {
602 struct gf_w8_logtable_data *ltd;
603
604 ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
605 return (a == 0 || b == 0) ? 0 : ltd->antilog_tbl[(unsigned)(ltd->log_tbl[a] + ltd->log_tbl[b])];
606 }
607
608 static
609 inline
610 uint32_t
611 gf_w8_log_divide (gf_t *gf, uint32_t a, uint32_t b)
612 {
613 int log_sum = 0;
614 struct gf_w8_logtable_data *ltd;
615
616 if (a == 0 || b == 0) return 0;
617 ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
618
619 log_sum = ltd->log_tbl[a] - ltd->log_tbl[b] + (GF_MULT_GROUP_SIZE);
620 return (ltd->antilog_tbl[log_sum]);
621 }
622
623 static
624 uint32_t
625 gf_w8_log_inverse (gf_t *gf, uint32_t a)
626 {
627 struct gf_w8_logtable_data *ltd;
628
629 ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
630 return (ltd->inv_tbl[a]);
631 }
632
633 static
634 uint32_t
635 gf_w8_logzero_inverse (gf_t *gf, uint32_t a)
636 {
637 struct gf_w8_logzero_table_data *ltd;
638
639 ltd = (struct gf_w8_logzero_table_data *) ((gf_internal_t *) gf->scratch)->private;
640 return (ltd->inv_tbl[a]);
641 }
642
643 static
644 uint32_t
645 gf_w8_logzero_small_inverse (gf_t *gf, uint32_t a)
646 {
647 struct gf_w8_logzero_small_table_data *std;
648
649 std = (struct gf_w8_logzero_small_table_data *) ((gf_internal_t *) gf->scratch)->private;
650 return (std->inv_tbl[a]);
651 }
652
653 static
654 void
655 gf_w8_log_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
656 {
657 int i;
658 uint8_t lv;
659 uint8_t *s8, *d8;
660 struct gf_w8_logtable_data *ltd;
661
662 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
663 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
664
665 ltd = (struct gf_w8_logtable_data *) ((gf_internal_t *) gf->scratch)->private;
666 s8 = (uint8_t *) src;
667 d8 = (uint8_t *) dest;
668
669 lv = ltd->log_tbl[val];
670
671 if (xor) {
672 for (i = 0; i < bytes; i++) {
673 d8[i] ^= (s8[i] == 0 ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[s8[i]]]);
674 }
675 } else {
676 for (i = 0; i < bytes; i++) {
677 d8[i] = (s8[i] == 0 ? 0 : ltd->antilog_tbl[lv + ltd->log_tbl[s8[i]]]);
678 }
679 }
680 }
681
682 static
683 void
684 gf_w8_logzero_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
685 {
686 int i;
687 uint8_t lv;
688 uint8_t *s8, *d8;
689 struct gf_w8_logzero_table_data *ltd;
690 struct gf_w8_logzero_small_table_data *std;
691 short *log;
692 uint8_t *alt;
693 gf_internal_t *h;
694
695 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
696 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
697
698 h = (gf_internal_t *) gf->scratch;
699
700 if (h->arg1 == 1) {
701 std = (struct gf_w8_logzero_small_table_data *) h->private;
702 log = std->log_tbl;
703 alt = std->antilog_tbl;
704 } else {
705 ltd = (struct gf_w8_logzero_table_data *) h->private;
706 log = ltd->log_tbl;
707 alt = ltd->antilog_tbl;
708 }
709 s8 = (uint8_t *) src;
710 d8 = (uint8_t *) dest;
711
712 lv = log[val];
713
714 if (xor) {
715 for (i = 0; i < bytes; i++) {
716 d8[i] ^= (alt[lv + log[s8[i]]]);
717 }
718 } else {
719 for (i = 0; i < bytes; i++) {
720 d8[i] = (alt[lv + log[s8[i]]]);
721 }
722 }
723 }
724
725 static
726 int gf_w8_log_init(gf_t *gf)
727 {
728 gf_internal_t *h;
729 struct gf_w8_logtable_data *ltd = NULL;
730 struct gf_w8_logzero_table_data *ztd = NULL;
731 struct gf_w8_logzero_small_table_data *std = NULL;
732 uint8_t *alt;
733 uint8_t *inv;
734 int i, b;
735 int check = 0;
736
737 h = (gf_internal_t *) gf->scratch;
738 if (h->mult_type == GF_MULT_LOG_TABLE) {
739 ltd = h->private;
740 alt = ltd->antilog_tbl;
741 inv = ltd->inv_tbl;
742 } else if (h->mult_type == GF_MULT_LOG_ZERO) {
743 std = h->private;
744 alt = std->antilog_tbl;
745 std->div_tbl = (alt + 255);
746 inv = std->inv_tbl;
747 } else {
748 ztd = h->private;
749 alt = ztd->antilog_tbl;
750 ztd->inv_tbl = (alt + 512 + 256);
751 ztd->div_tbl = (alt + 255);
752 inv = ztd->inv_tbl;
753 }
754
755 for (i = 0; i < GF_MULT_GROUP_SIZE+1; i++) {
756 if (h->mult_type == GF_MULT_LOG_TABLE)
757 ltd->log_tbl[i] = 0;
758 else if (h->mult_type == GF_MULT_LOG_ZERO)
759 std->log_tbl[i] = 0;
760 else
761 ztd->log_tbl[i] = 0;
762 }
763
764 if (h->mult_type == GF_MULT_LOG_TABLE) {
765 ltd->log_tbl[0] = 0;
766 } else if (h->mult_type == GF_MULT_LOG_ZERO) {
767 std->log_tbl[0] = 510;
768 } else {
769 ztd->log_tbl[0] = 512;
770 }
771
772 b = 1;
773 for (i = 0; i < GF_MULT_GROUP_SIZE; i++) {
774 if (h->mult_type == GF_MULT_LOG_TABLE) {
775 if (ltd->log_tbl[b] != 0) check = 1;
776 ltd->log_tbl[b] = i;
777 } else if (h->mult_type == GF_MULT_LOG_ZERO) {
778 if (std->log_tbl[b] != 0) check = 1;
779 std->log_tbl[b] = i;
780 } else {
781 if (ztd->log_tbl[b] != 0) check = 1;
782 ztd->log_tbl[b] = i;
783 }
784 alt[i] = b;
785 alt[i+GF_MULT_GROUP_SIZE] = b;
786 b <<= 1;
787 if (b & GF_FIELD_SIZE) {
788 b = b ^ h->prim_poly;
789 }
790 }
791 if (check) {
792 _gf_errno = GF_E_LOGPOLY;
793 return 0;
794 }
795
796 if (h->mult_type == GF_MULT_LOG_ZERO) bzero(alt+510, 255);
797
798 if (h->mult_type == GF_MULT_LOG_ZERO_EXT) {
799 bzero(alt+512, 255);
800 alt[512+512] = 0;
801 }
802
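  /* The loop below walks i through the powers of x (advancing i = x^k by a
     shift-and-reduce each iteration) while b counts the exponent down from
     GF_MULT_GROUP_SIZE, so inv[x^k] is set to alt[255-k] = x^(255-k), the
     multiplicative inverse of x^k.  Every nonzero element is visited once. */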
803 inv[0] = 0; /* Not really, but we need to fill it with something */
804 i = 1;
805 b = GF_MULT_GROUP_SIZE;
806 do {
807 inv[i] = alt[b];
808 i <<= 1;
809 if (i & (1 << 8)) i ^= h->prim_poly;
810 b--;
811 } while (i != 1);
812
813 if (h->mult_type == GF_MULT_LOG_TABLE) {
814 SET_FUNCTION(gf,inverse,w32,gf_w8_log_inverse)
815 SET_FUNCTION(gf,divide,w32,gf_w8_log_divide)
816 SET_FUNCTION(gf,multiply,w32,gf_w8_log_multiply)
817 SET_FUNCTION(gf,multiply_region,w32,gf_w8_log_multiply_region)
818 } else if (h->mult_type == GF_MULT_LOG_ZERO) {
819 SET_FUNCTION(gf,inverse,w32,gf_w8_logzero_small_inverse)
820 SET_FUNCTION(gf,divide,w32,gf_w8_logzero_small_divide)
821 SET_FUNCTION(gf,multiply,w32,gf_w8_logzero_small_multiply)
822 SET_FUNCTION(gf,multiply_region,w32,gf_w8_logzero_multiply_region)
823 } else {
824 SET_FUNCTION(gf,inverse,w32,gf_w8_logzero_inverse)
825 SET_FUNCTION(gf,divide,w32,gf_w8_logzero_divide)
826 SET_FUNCTION(gf,multiply,w32,gf_w8_logzero_multiply)
827 SET_FUNCTION(gf,multiply_region,w32,gf_w8_logzero_multiply_region)
828 }
829 return 1;
830 }
831
832 /* ------------------------------------------------------------
833 IMPLEMENTATION: FULL_TABLE:
834
835 JSP: Kevin wrote this, and I'm converting it to my structure.
836 */
837
838 static
839 gf_val_32_t
840 gf_w8_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
841 {
842 struct gf_w8_single_table_data *ftd;
843
844 ftd = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private;
845 return (ftd->multtable[a][b]);
846 }
847
848 static
849 gf_val_32_t
850 gf_w8_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
851 {
852 struct gf_w8_single_table_data *ftd;
853
854 ftd = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private;
855 return (ftd->divtable[a][b]);
856 }
857
858 static
859 gf_val_32_t
860 gf_w8_default_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
861 {
862 struct gf_w8_default_data *ftd;
863
864 ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private;
865 return (ftd->multtable[a][b]);
866 }
867
868 #if defined(INTEL_SSSE3) || defined(ARM_NEON)
869 static
870 gf_val_32_t
871 gf_w8_default_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
872 {
873 struct gf_w8_default_data *ftd;
874
875 ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private;
876 return (ftd->divtable[a][b]);
877 }
878 #endif
879
880 static
881 gf_val_32_t
882 gf_w8_double_table_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
883 {
884 struct gf_w8_double_table_data *ftd;
885
886 ftd = (struct gf_w8_double_table_data *) ((gf_internal_t *) gf->scratch)->private;
887 return (ftd->mult[a][b]);
888 }
889
890 static
891 gf_val_32_t
892 gf_w8_double_table_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
893 {
894 struct gf_w8_double_table_data *ftd;
895
896 ftd = (struct gf_w8_double_table_data *) ((gf_internal_t *) gf->scratch)->private;
897 return (ftd->div[a][b]);
898 }
899
900 static
901 void
902 gf_w8_double_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
903 {
904 uint16_t *base;
905 uint32_t b, c, vc, vb;
906 gf_internal_t *h;
907 struct gf_w8_double_table_data *dtd;
908 struct gf_w8_double_table_lazy_data *ltd;
909 gf_region_data rd;
910
911 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
912 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
913
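  /* This region routine consumes two source bytes per lookup: base points at a
     65536-entry table indexed by a byte pair whose entry holds both products.
     The precomputed DOUBLE_TABLE variant stores such a table for every val;
     the LAZY variant keeps only the 256x256 single-byte table smult and builds
     the two-byte table for this particular val on the fly below. */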
914 h = (gf_internal_t *) (gf->scratch);
915 if (h->region_type & GF_REGION_LAZY) {
916 ltd = (struct gf_w8_double_table_lazy_data *) h->private;
917 base = ltd->mult;
918 for (b = 0; b < GF_FIELD_SIZE; b++) {
919 vb = (ltd->smult[val][b] << 8);
920 for (c = 0; c < GF_FIELD_SIZE; c++) {
921 vc = ltd->smult[val][c];
922 base[(b << 8)| c] = (vb | vc);
923 }
924 }
925
926 } else {
927 dtd = (struct gf_w8_double_table_data *) h->private;
928 base = &(dtd->mult[val][0]);
929 }
930
931 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
932 gf_do_initial_region_alignment(&rd);
933 gf_two_byte_region_table_multiply(&rd, base);
934 gf_do_final_region_alignment(&rd);
935 }
936
937 static
938 gf_val_32_t
939 gf_w8_double_table_lazy_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
940 {
941 struct gf_w8_double_table_lazy_data *ftd;
942
943 ftd = (struct gf_w8_double_table_lazy_data *) ((gf_internal_t *) gf->scratch)->private;
944 return (ftd->smult[a][b]);
945 }
946
947 static
948 gf_val_32_t
949 gf_w8_double_table_lazy_divide(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
950 {
951 struct gf_w8_double_table_lazy_data *ftd;
952
953 ftd = (struct gf_w8_double_table_lazy_data *) ((gf_internal_t *) gf->scratch)->private;
954 return (ftd->div[a][b]);
955 }
956
957 static
958 void
959 gf_w8_table_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
960 {
961 int i;
962 uint8_t *s8, *d8;
963 struct gf_w8_single_table_data *ftd;
964
965 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
966 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
967
968 ftd = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private;
969 s8 = (uint8_t *) src;
970 d8 = (uint8_t *) dest;
971
972 if (xor) {
973 for (i = 0; i < bytes; i++) {
974 d8[i] ^= ftd->multtable[s8[i]][val];
975 }
976 } else {
977 for (i = 0; i < bytes; i++) {
978 d8[i] = ftd->multtable[s8[i]][val];
979 }
980 }
981 }
982
983 #ifdef INTEL_SSSE3
984 static
985 void
986 gf_w8_split_multiply_region_sse(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
987 {
988 uint8_t *bh, *bl, *sptr, *dptr;
989 __m128i loset, t1, r, va, mth, mtl;
990 struct gf_w8_half_table_data *htd;
991 gf_region_data rd;
992
993 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
994 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
995
996 htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) (gf->scratch))->private;
997
998 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
999 gf_do_initial_region_alignment(&rd);
1000
1001 bh = (uint8_t *) htd->high;
1002 bh += (val << 4);
1003 bl = (uint8_t *) htd->low;
1004 bl += (val << 4);
1005
1006 sptr = rd.s_start;
1007 dptr = rd.d_start;
1008
1009 mth = _mm_loadu_si128 ((__m128i *)(bh));
1010 mtl = _mm_loadu_si128 ((__m128i *)(bl));
1011 loset = _mm_set1_epi8 (0x0f);
1012
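  /* mtl and mth hold the sixteen products val*n and val*(n<<4) for n = 0..15.
     Multiplication by val is linear over GF(2), so val*x = val*(x & 0x0f) ^
     val*(x & 0xf0) for any byte x; _mm_shuffle_epi8 performs the sixteen 4-bit
     table lookups in parallel, one per byte of the vector. */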
1013 if (xor) {
1014 while (sptr < (uint8_t *) rd.s_top) {
1015 va = _mm_load_si128 ((__m128i *)(sptr));
1016 t1 = _mm_and_si128 (loset, va);
1017 r = _mm_shuffle_epi8 (mtl, t1);
1018 va = _mm_srli_epi64 (va, 4);
1019 t1 = _mm_and_si128 (loset, va);
1020 r = _mm_xor_si128 (r, _mm_shuffle_epi8 (mth, t1));
1021 va = _mm_load_si128 ((__m128i *)(dptr));
1022 r = _mm_xor_si128 (r, va);
1023 _mm_store_si128 ((__m128i *)(dptr), r);
1024 dptr += 16;
1025 sptr += 16;
1026 }
1027 } else {
1028 while (sptr < (uint8_t *) rd.s_top) {
1029 va = _mm_load_si128 ((__m128i *)(sptr));
1030 t1 = _mm_and_si128 (loset, va);
1031 r = _mm_shuffle_epi8 (mtl, t1);
1032 va = _mm_srli_epi64 (va, 4);
1033 t1 = _mm_and_si128 (loset, va);
1034 r = _mm_xor_si128 (r, _mm_shuffle_epi8 (mth, t1));
1035 _mm_store_si128 ((__m128i *)(dptr), r);
1036 dptr += 16;
1037 sptr += 16;
1038 }
1039 }
1040
1041 gf_do_final_region_alignment(&rd);
1042 }
1043 #endif
1044
1045
1046 /* ------------------------------------------------------------
1047 IMPLEMENTATION: SPLIT TABLE (half tables): b*a = b*(a & 0xf0) ^ b*(a & 0x0f)
1048 */
1049
1050 static
1051 gf_val_32_t
1052 gf_w8_split_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
1053 {
1054 struct gf_w8_half_table_data *htd;
1055 htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) gf->scratch)->private;
1056
1057 return htd->high[b][a>>4] ^ htd->low[b][a&0xf];
1058 }
1059
1060 static
1061 void
1062 gf_w8_split_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
1063 {
1064 int i;
1065 uint8_t *s8, *d8;
1066 struct gf_w8_half_table_data *htd;
1067
1068 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1069 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1070
1071 htd = (struct gf_w8_half_table_data *) ((gf_internal_t *) gf->scratch)->private;
1072 s8 = (uint8_t *) src;
1073 d8 = (uint8_t *) dest;
1074
1075 if (xor) {
1076 for (i = 0; i < bytes; i++) {
1077 d8[i] ^= (htd->high[val][s8[i]>>4] ^ htd->low[val][s8[i]&0xf]);
1078 }
1079 } else {
1080 for (i = 0; i < bytes; i++) {
1081 d8[i] = (htd->high[val][s8[i]>>4] ^ htd->low[val][s8[i]&0xf]);
1082 }
1083 }
1084 }
1085
1086
1087 static
1088 int gf_w8_split_init(gf_t *gf)
1089 {
1090 gf_internal_t *h;
1091 struct gf_w8_half_table_data *htd;
1092 int a, b;
1093
1094 h = (gf_internal_t *) gf->scratch;
1095 htd = (struct gf_w8_half_table_data *)h->private;
1096
1097 bzero(htd->high, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE);
1098 bzero(htd->low, sizeof(uint8_t)*GF_FIELD_SIZE*GF_HALF_SIZE);
1099
1100 for (a = 1; a < GF_FIELD_SIZE; a++) {
1101 for (b = 1; b < GF_HALF_SIZE; b++) {
1102 htd->low[a][b] = gf_w8_shift_multiply(gf,a,b);
1103 htd->high[a][b] = gf_w8_shift_multiply(gf,a,b<<4);
1104 }
1105 }
1106
1107 SET_FUNCTION(gf,multiply,w32,gf_w8_split_multiply)
1108
1109 #if defined(INTEL_SSSE3)
1110 if (gf_cpu_supports_intel_ssse3 && !(h->region_type & GF_REGION_NOSIMD)) {
1111 SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_sse)
1112 } else {
1113 #elif defined(ARM_NEON)
1114 if (gf_cpu_supports_arm_neon && !(h->region_type & GF_REGION_NOSIMD)) {
1115 gf_w8_neon_split_init(gf);
1116 } else {
1117 #endif
1118 SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region)
1119 if(h->region_type & GF_REGION_SIMD)
1120 return 0;
1121 #if defined(INTEL_SSSE3) || defined(ARM_NEON)
1122 }
1123 #endif
1124
1125 return 1;
1126 }
1127
1128 /* JSP: This is disgusting, but it is what it is. If there is no SSSE3 or
1129 NEON, then the default is equivalent to single table. If there is, then
1130 we use the "gf_w8_default_data" which is a hybrid of SPLIT & TABLE. */
1131
1132 static
1133 int gf_w8_table_init(gf_t *gf)
1134 {
1135 gf_internal_t *h;
1136 struct gf_w8_single_table_data *ftd = NULL;
1137 struct gf_w8_double_table_data *dtd = NULL;
1138 struct gf_w8_double_table_lazy_data *ltd = NULL;
1139 struct gf_w8_default_data *dd = NULL;
1140 int a, b, c, prod, scase;
1141
1142 h = (gf_internal_t *) gf->scratch;
1143
1144 if (h->mult_type == GF_MULT_DEFAULT &&
1145 (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon)) {
1146 dd = (struct gf_w8_default_data *)h->private;
1147 scase = 3;
1148 bzero(dd->high, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE);
1149 bzero(dd->low, sizeof(uint8_t) * GF_FIELD_SIZE * GF_HALF_SIZE);
1150 bzero(dd->divtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
1151 bzero(dd->multtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
1152 } else if (h->mult_type == GF_MULT_DEFAULT ||
1153 h->region_type == 0 || (h->region_type & GF_REGION_CAUCHY)) {
1154 ftd = (struct gf_w8_single_table_data *)h->private;
1155 bzero(ftd->divtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
1156 bzero(ftd->multtable, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
1157 scase = 0;
1158 } else if (h->region_type == GF_REGION_DOUBLE_TABLE) {
1159 dtd = (struct gf_w8_double_table_data *)h->private;
1160 bzero(dtd->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
1161 bzero(dtd->mult, sizeof(uint16_t) * GF_FIELD_SIZE * GF_FIELD_SIZE * GF_FIELD_SIZE);
1162 scase = 1;
1163 } else if (h->region_type == (GF_REGION_DOUBLE_TABLE | GF_REGION_LAZY)) {
1164 ltd = (struct gf_w8_double_table_lazy_data *)h->private;
1165 bzero(ltd->div, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
1166 bzero(ltd->smult, sizeof(uint8_t) * GF_FIELD_SIZE * GF_FIELD_SIZE);
1167 scase = 2;
1168 } else {
1169 fprintf(stderr, "Internal error in gf_w8_table_init\n");
1170 assert(0);
1171 }
1172
1173 for (a = 1; a < GF_FIELD_SIZE; a++) {
1174 for (b = 1; b < GF_FIELD_SIZE; b++) {
1175 prod = gf_w8_shift_multiply(gf,a,b);
1176 switch (scase) {
1177 case 0:
1178 ftd->multtable[a][b] = prod;
1179 ftd->divtable[prod][b] = a;
1180 break;
1181 case 1:
1182 dtd->div[prod][b] = a;
1183 for (c = 0; c < GF_FIELD_SIZE; c++) {
1184 dtd->mult[a][(c<<8)|b] |= prod;
1185 dtd->mult[a][(b<<8)|c] |= (prod<<8);
1186 }
1187 break;
1188 case 2:
1189 ltd->div[prod][b] = a;
1190 ltd->smult[a][b] = prod;
1191 break;
1192 case 3:
1193 dd->multtable[a][b] = prod;
1194 dd->divtable[prod][b] = a;
1195 if ((b & 0xf) == b) { dd->low[a][b] = prod; }
1196 if ((b & 0xf0) == b) { dd->high[a][b>>4] = prod; }
1197 break;
1198 }
1199 }
1200 }
1201
1202 SET_FUNCTION(gf,inverse,w32,NULL) /* Will set from divide */
1203 switch (scase) {
1204 case 0:
1205 SET_FUNCTION(gf,divide,w32,gf_w8_table_divide)
1206 SET_FUNCTION(gf,multiply,w32,gf_w8_table_multiply)
1207 SET_FUNCTION(gf,multiply_region,w32,gf_w8_table_multiply_region)
1208 break;
1209 case 1:
1210 SET_FUNCTION(gf,divide,w32,gf_w8_double_table_divide)
1211 SET_FUNCTION(gf,multiply,w32,gf_w8_double_table_multiply)
1212 SET_FUNCTION(gf,multiply_region,w32,gf_w8_double_table_multiply_region)
1213 break;
1214 case 2:
1215 SET_FUNCTION(gf,divide,w32,gf_w8_double_table_lazy_divide)
1216 SET_FUNCTION(gf,multiply,w32,gf_w8_double_table_lazy_multiply)
1217 SET_FUNCTION(gf,multiply_region,w32,gf_w8_double_table_multiply_region)
1218 break;
1219 case 3:
1220 #if defined(INTEL_SSSE3) || defined(ARM_NEON)
1221 if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) {
1222 SET_FUNCTION(gf,divide,w32,gf_w8_default_divide)
1223 SET_FUNCTION(gf,multiply,w32,gf_w8_default_multiply)
1224 #if defined(INTEL_SSSE3)
1225 if (gf_cpu_supports_intel_ssse3) {
1226 SET_FUNCTION(gf,multiply_region,w32,gf_w8_split_multiply_region_sse)
1227 }
1228 #elif defined(ARM_NEON)
1229 if (gf_cpu_supports_arm_neon) {
1230 gf_w8_neon_split_init(gf);
1231 }
1232 #endif
1233 }
1234 #endif
1235 break;
1236 }
1237 return 1;
1238 }
1239
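/* ALTMAP layout for the composite field: the region is split in half, the low
   4-bit components of the elements stored in the first half and the high
   components in the second.  The five base-field region multiplies below
   compute d0 = a0*b0 ^ a1*b1 and d1 = a0*b1 ^ a1*b0 ^ s*a1*b1, where b0/b1
   are the two halves of val and s is h->prim_poly of the composite field. */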
1240 static
1241 void
1242 gf_w8_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
1243 {
1244 gf_internal_t *h = (gf_internal_t *) gf->scratch;
1245 gf_t *base_gf = h->base_gf;
1246 uint8_t val0 = val & 0x0f;
1247 uint8_t val1 = (val & 0xf0) >> 4;
1248 gf_region_data rd;
1249 int sub_reg_size;
1250
1251 if (val == 0) {
1252 if (xor) return;
1253 bzero(dest, bytes);
1254 return;
1255 }
1256
1257 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
1258 gf_do_initial_region_alignment(&rd);
1259
1260 sub_reg_size = ((uint8_t *)rd.d_top - (uint8_t *)rd.d_start) / 2;
1261
1262 base_gf->multiply_region.w32(base_gf, rd.s_start, rd.d_start, val0, sub_reg_size, xor);
1263 base_gf->multiply_region.w32(base_gf, (uint8_t *)rd.s_start+sub_reg_size, rd.d_start, val1, sub_reg_size, 1);
1264 base_gf->multiply_region.w32(base_gf, rd.s_start, (uint8_t *)rd.d_start+sub_reg_size, val1, sub_reg_size, xor);
1265 base_gf->multiply_region.w32(base_gf, (uint8_t *)rd.s_start+sub_reg_size, (uint8_t *)rd.d_start+sub_reg_size, val0, sub_reg_size, 1);
1266 base_gf->multiply_region.w32(base_gf, (uint8_t *)rd.s_start+sub_reg_size, (uint8_t *)rd.d_start+sub_reg_size, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1);
1267
1268 gf_do_final_region_alignment(&rd);
1269 }
1270
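/* Composite multiplication treats an 8-bit element as a1*x + a0 with 4-bit
   coefficients in GF(2^4), reduced modulo x^2 + s*x + 1, where s is
   h->prim_poly of this composite field:

     (a1*x + a0)(b1*x + b0) = (a1*b0 ^ a0*b1 ^ s*a1*b1)*x + (a0*b0 ^ a1*b1)

   The recursive version calls the base field's multiply; the inline version
   below it reads the base field's 4-bit multiplication table directly. */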
1271 static
1272 gf_val_32_t
1273 gf_w8_composite_multiply_recursive(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
1274 {
1275 gf_internal_t *h = (gf_internal_t *) gf->scratch;
1276 gf_t *base_gf = h->base_gf;
1277 uint8_t b0 = b & 0x0f;
1278 uint8_t b1 = (b & 0xf0) >> 4;
1279 uint8_t a0 = a & 0x0f;
1280 uint8_t a1 = (a & 0xf0) >> 4;
1281 uint8_t a1b1;
1282
1283 a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
1284
1285 return ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
1286 ((base_gf->multiply.w32(base_gf, a1, b0) ^
1287 base_gf->multiply.w32(base_gf, a0, b1) ^
1288 base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4));
1289 }
1290
1291 static
1292 gf_val_32_t
1293 gf_w8_composite_multiply_inline(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
1294 {
1295 gf_internal_t *h = (gf_internal_t *) gf->scratch;
1296 uint8_t b0 = b & 0x0f;
1297 uint8_t b1 = (b & 0xf0) >> 4;
1298 uint8_t a0 = a & 0x0f;
1299 uint8_t a1 = (a & 0xf0) >> 4;
1300 uint8_t a1b1, *mt;
1301 struct gf_w8_composite_data *cd;
1302
1303 cd = (struct gf_w8_composite_data *) h->private;
1304 mt = cd->mult_table;
1305
1306 a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1);
1307
1308 return ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) |
1309 ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^
1310 GF_W4_INLINE_MULTDIV(mt, a0, b1) ^
1311 GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4));
1312 }
1313
1314 /*
1315 * Composite field division trick (explained in 2007 tech report)
1316 *
1317 * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1
1318 *
1319 * let c = b^-1
1320 *
1321 * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0)
1322 *
1323 * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1
1324 *
1325 * let d = b1c1 and d+1 = b0c0
1326 *
1327 * solve s*b1c1+b1c0+b0c1 = 0
1328 *
1329 * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1
1330 *
1331 * c0 = (d+1)b0^-1
1332 * c1 = d*b1^-1
1333 *
1334 * a / b = a * c
1335 */
1336
1337 static
1338 gf_val_32_t
1339 gf_w8_composite_inverse(gf_t *gf, gf_val_32_t a)
1340 {
1341 gf_internal_t *h = (gf_internal_t *) gf->scratch;
1342 gf_t *base_gf = h->base_gf;
1343 uint8_t a0 = a & 0x0f;
1344 uint8_t a1 = (a & 0xf0) >> 4;
1345 uint8_t c0, c1, c, d, tmp;
1346 uint8_t a0inv, a1inv;
1347
1348 if (a0 == 0) {
1349 a1inv = base_gf->inverse.w32(base_gf, a1) & 0xf;
1350 c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly);
1351 c1 = a1inv;
1352 } else if (a1 == 0) {
1353 c0 = base_gf->inverse.w32(base_gf, a0);
1354 c1 = 0;
1355 } else {
1356 a1inv = base_gf->inverse.w32(base_gf, a1) & 0xf;
1357 a0inv = base_gf->inverse.w32(base_gf, a0) & 0xf;
1358
1359 d = base_gf->multiply.w32(base_gf, a1, a0inv) & 0xf;
1360
1361 tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly) & 0xf;
1362 tmp = base_gf->inverse.w32(base_gf, tmp) & 0xf;
1363
1364 d = base_gf->multiply.w32(base_gf, d, tmp) & 0xf;
1365
1366 c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv) & 0xf;
1367 c1 = base_gf->multiply.w32(base_gf, d, a1inv) & 0xf;
1368 }
1369
1370 c = c0 | (c1 << 4);
1371
1372 return c;
1373 }
1374
1375 static
1376 void
1377 gf_w8_composite_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
1378 {
1379 gf_region_data rd;
1380 gf_internal_t *h = (gf_internal_t *) gf->scratch;
1381 gf_t *base_gf = h->base_gf;
1382 uint8_t b0 = val & 0x0f;
1383 uint8_t b1 = (val & 0xf0) >> 4;
1384 uint8_t *s8;
1385 uint8_t *d8;
1386 uint8_t *mt;
1387 uint8_t a0, a1, a1b1;
1388 struct gf_w8_composite_data *cd;
1389
1390 cd = (struct gf_w8_composite_data *) h->private;
1391
1392 if (val == 0) {
1393 if (xor) return;
1394 bzero(dest, bytes);
1395 return;
1396 }
1397
1398 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 1);
1399 gf_do_initial_region_alignment(&rd);
1400
1401
1402 s8 = (uint8_t *) rd.s_start;
1403 d8 = (uint8_t *) rd.d_start;
1404
1405 mt = cd->mult_table;
1406 if (mt == NULL) {
1407 if (xor) {
1408 while (d8 < (uint8_t *) rd.d_top) {
1409 a0 = *s8 & 0x0f;
1410 a1 = (*s8 & 0xf0) >> 4;
1411 a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
1412
1413 *d8 ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
1414 ((base_gf->multiply.w32(base_gf, a1, b0) ^
1415 base_gf->multiply.w32(base_gf, a0, b1) ^
1416 base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4));
1417 s8++;
1418 d8++;
1419 }
1420 } else {
1421 while (d8 < (uint8_t *) rd.d_top) {
1422 a0 = *s8 & 0x0f;
1423 a1 = (*s8 & 0xf0) >> 4;
1424 a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
1425
1426 *d8 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
1427 ((base_gf->multiply.w32(base_gf, a1, b0) ^
1428 base_gf->multiply.w32(base_gf, a0, b1) ^
1429 base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 4));
1430 s8++;
1431 d8++;
1432 }
1433 }
1434 } else {
1435 if (xor) {
1436 while (d8 < (uint8_t *) rd.d_top) {
1437 a0 = *s8 & 0x0f;
1438 a1 = (*s8 & 0xf0) >> 4;
1439 a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1);
1440
1441 *d8 ^= ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) |
1442 ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^
1443 GF_W4_INLINE_MULTDIV(mt, a0, b1) ^
1444 GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4));
1445 s8++;
1446 d8++;
1447 }
1448 } else {
1449 while (d8 < (uint8_t *) rd.d_top) {
1450 a0 = *s8 & 0x0f;
1451 a1 = (*s8 & 0xf0) >> 4;
1452 a1b1 = GF_W4_INLINE_MULTDIV(mt, a1, b1);
1453
1454 *d8 = ((GF_W4_INLINE_MULTDIV(mt, a0, b0) ^ a1b1) |
1455 ((GF_W4_INLINE_MULTDIV(mt, a1, b0) ^
1456 GF_W4_INLINE_MULTDIV(mt, a0, b1) ^
1457 GF_W4_INLINE_MULTDIV(mt, a1b1, h->prim_poly)) << 4));
1458 s8++;
1459 d8++;
1460 }
1461 }
1462 }
1463 gf_do_final_region_alignment(&rd);
1464 return;
1465 }
1466
1467 static
1468 int gf_w8_composite_init(gf_t *gf)
1469 {
1470 gf_internal_t *h = (gf_internal_t *) gf->scratch;
1471 struct gf_w8_composite_data *cd;
1472
1473 if (h->base_gf == NULL) return 0;
1474
1475 cd = (struct gf_w8_composite_data *) h->private;
1476 cd->mult_table = gf_w4_get_mult_table(h->base_gf);
1477
1478 if (h->region_type & GF_REGION_ALTMAP) {
1479 SET_FUNCTION(gf,multiply_region,w32,gf_w8_composite_multiply_region_alt)
1480 } else {
1481 SET_FUNCTION(gf,multiply_region,w32,gf_w8_composite_multiply_region)
1482 }
1483
1484 if (cd->mult_table == NULL) {
1485 SET_FUNCTION(gf,multiply,w32,gf_w8_composite_multiply_recursive)
1486 } else {
1487 SET_FUNCTION(gf,multiply,w32,gf_w8_composite_multiply_inline)
1488 }
1489 SET_FUNCTION(gf,divide,w32,NULL)
1490 SET_FUNCTION(gf,inverse,w32,gf_w8_composite_inverse)
1491
1492 return 1;
1493 }
1494
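/* BYTWO_p scans the bits of a from the most significant end, doubling the
   accumulated product (multiplying it by x, with reduction) at each step and
   xoring in b whenever the bit is set.  BYTWO_b scans a from the least
   significant end and doubles b instead.  The region versions apply the same
   doubling to 8 (or, with SSE, 16) packed bytes at a time via AB2 / SSE_AB2. */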
1495 static
1496 inline
1497 gf_val_32_t
1498 gf_w8_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
1499 {
1500 uint32_t prod, pp, pmask, amask;
1501 gf_internal_t *h;
1502
1503 h = (gf_internal_t *) gf->scratch;
1504 pp = h->prim_poly;
1505
1506
1507 prod = 0;
1508 pmask = 0x80;
1509 amask = 0x80;
1510
1511 while (amask != 0) {
1512 if (prod & pmask) {
1513 prod = ((prod << 1) ^ pp);
1514 } else {
1515 prod <<= 1;
1516 }
1517 if (a & amask) prod ^= b;
1518 amask >>= 1;
1519 }
1520 return prod;
1521 }
1522
1523 static
1524 inline
1525 gf_val_32_t
1526 gf_w8_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
1527 {
1528 uint32_t prod, pp, bmask;
1529 gf_internal_t *h;
1530
1531 h = (gf_internal_t *) gf->scratch;
1532 pp = h->prim_poly;
1533
1534 prod = 0;
1535 bmask = 0x80;
1536
1537 while (1) {
1538 if (a & 1) prod ^= b;
1539 a >>= 1;
1540 if (a == 0) return prod;
1541 if (b & bmask) {
1542 b = ((b << 1) ^ pp);
1543 } else {
1544 b <<= 1;
1545 }
1546 }
1547 }
1548
1549 static
1550 void
1551 gf_w8_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
1552 {
1553 uint64_t *s64, *d64, t1, t2, ta, prod, amask;
1554 gf_region_data rd;
1555 struct gf_w8_bytwo_data *btd;
1556
1557 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1558 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1559
1560 btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
1561
1562 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
1563 gf_do_initial_region_alignment(&rd);
1564
1565 s64 = (uint64_t *) rd.s_start;
1566 d64 = (uint64_t *) rd.d_start;
1567
1568 if (xor) {
1569 while (s64 < (uint64_t *) rd.s_top) {
1570 prod = 0;
1571 amask = 0x80;
1572 ta = *s64;
1573 while (amask != 0) {
1574 AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
1575 if (val & amask) prod ^= ta;
1576 amask >>= 1;
1577 }
1578 *d64 ^= prod;
1579 d64++;
1580 s64++;
1581 }
1582 } else {
1583 while (s64 < (uint64_t *) rd.s_top) {
1584 prod = 0;
1585 amask = 0x80;
1586 ta = *s64;
1587 while (amask != 0) {
1588 AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
1589 if (val & amask) prod ^= ta;
1590 amask >>= 1;
1591 }
1592 *d64 = prod;
1593 d64++;
1594 s64++;
1595 }
1596 }
1597 gf_do_final_region_alignment(&rd);
1598 }
1599
1600 #define BYTWO_P_ONESTEP {\
1601 SSE_AB2(pp, m1 ,m2, prod, t1, t2); \
1602 t1 = _mm_and_si128(v, one); \
1603 t1 = _mm_sub_epi8(t1, one); \
1604 t1 = _mm_and_si128(t1, ta); \
1605 prod = _mm_xor_si128(prod, t1); \
1606 v = _mm_srli_epi64(v, 1); }
1607
1608 #ifdef INTEL_SSE2
1609 static
1610 void
1611 gf_w8_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
1612 {
1613 int i;
1614 uint8_t *s8, *d8;
1615 uint8_t vrev;
1616 __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v;
1617 struct gf_w8_bytwo_data *btd;
1618 gf_region_data rd;
1619
1620 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1621 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1622
1623 btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
1624
1625 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
1626 gf_do_initial_region_alignment(&rd);
1627
1628 vrev = 0;
1629 for (i = 0; i < 8; i++) {
1630 vrev <<= 1;
1631 if (!(val & (1 << i))) vrev |= 1;
1632 }
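  /* vrev is val bit-reversed and complemented, so that in each BYTWO_P_ONESTEP
     below, (v & 1) - 1 yields an all-ones byte mask exactly when the next bit
     of val (taken most-significant first) is set; v is shifted right by one
     bit per step. */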
1633
1634 s8 = (uint8_t *) rd.s_start;
1635 d8 = (uint8_t *) rd.d_start;
1636
1637 pp = _mm_set1_epi8(btd->prim_poly&0xff);
1638 m1 = _mm_set1_epi8((btd->mask1)&0xff);
1639 m2 = _mm_set1_epi8((btd->mask2)&0xff);
1640 one = _mm_set1_epi8(1);
1641
1642 while (d8 < (uint8_t *) rd.d_top) {
1643 prod = _mm_setzero_si128();
1644 v = _mm_set1_epi8(vrev);
1645 ta = _mm_load_si128((__m128i *) s8);
1646 tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8);
1647 BYTWO_P_ONESTEP;
1648 BYTWO_P_ONESTEP;
1649 BYTWO_P_ONESTEP;
1650 BYTWO_P_ONESTEP;
1651 BYTWO_P_ONESTEP;
1652 BYTWO_P_ONESTEP;
1653 BYTWO_P_ONESTEP;
1654 BYTWO_P_ONESTEP;
1655 _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp));
1656 d8 += 16;
1657 s8 += 16;
1658 }
1659 gf_do_final_region_alignment(&rd);
1660 }
1661 #endif
1662
1663 #ifdef INTEL_SSE2
1664 static
1665 void
1666 gf_w8_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w8_bytwo_data *btd)
1667 {
1668 uint8_t *d8, *s8;
1669 __m128i pp, m1, m2, t1, t2, va;
1670
1671 s8 = (uint8_t *) rd->s_start;
1672 d8 = (uint8_t *) rd->d_start;
1673
1674 pp = _mm_set1_epi8(btd->prim_poly&0xff);
1675 m1 = _mm_set1_epi8((btd->mask1)&0xff);
1676 m2 = _mm_set1_epi8((btd->mask2)&0xff);
1677
1678 while (d8 < (uint8_t *) rd->d_top) {
1679 va = _mm_load_si128 ((__m128i *)(s8));
1680 SSE_AB2(pp, m1, m2, va, t1, t2);
1681 _mm_store_si128((__m128i *)d8, va);
1682 d8 += 16;
1683 s8 += 16;
1684 }
1685 }
1686 #endif
1687
1688 #ifdef INTEL_SSE2
1689 static
1690 void
1691 gf_w8_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w8_bytwo_data *btd)
1692 {
1693 uint8_t *d8, *s8;
1694 __m128i pp, m1, m2, t1, t2, va, vb;
1695
1696 s8 = (uint8_t *) rd->s_start;
1697 d8 = (uint8_t *) rd->d_start;
1698
1699 pp = _mm_set1_epi8(btd->prim_poly&0xff);
1700 m1 = _mm_set1_epi8((btd->mask1)&0xff);
1701 m2 = _mm_set1_epi8((btd->mask2)&0xff);
1702
1703 while (d8 < (uint8_t *) rd->d_top) {
1704 va = _mm_load_si128 ((__m128i *)(s8));
1705 SSE_AB2(pp, m1, m2, va, t1, t2);
1706 vb = _mm_load_si128 ((__m128i *)(d8));
1707 vb = _mm_xor_si128(vb, va);
1708 _mm_store_si128((__m128i *)d8, vb);
1709 d8 += 16;
1710 s8 += 16;
1711 }
1712 }
1713 #endif
1714
1715
1716 #ifdef INTEL_SSE2
1717 static
1718 void
1719 gf_w8_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
1720 {
1721 int itb;
1722 uint8_t *d8, *s8;
1723 __m128i pp, m1, m2, t1, t2, va, vb;
1724 struct gf_w8_bytwo_data *btd;
1725 gf_region_data rd;
1726
1727 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1728 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1729
1730 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
1731 gf_do_initial_region_alignment(&rd);
1732
1733 btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
1734
1735 if (val == 2) {
1736 if (xor) {
1737 gf_w8_bytwo_b_sse_region_2_xor(&rd, btd);
1738 } else {
1739 gf_w8_bytwo_b_sse_region_2_noxor(&rd, btd);
1740 }
1741 gf_do_final_region_alignment(&rd);
1742 return;
1743 }
1744
1745 s8 = (uint8_t *) rd.s_start;
1746 d8 = (uint8_t *) rd.d_start;
1747
1748 pp = _mm_set1_epi8(btd->prim_poly&0xff);
1749 m1 = _mm_set1_epi8((btd->mask1)&0xff);
1750 m2 = _mm_set1_epi8((btd->mask2)&0xff);
1751
1752 while (d8 < (uint8_t *) rd.d_top) {
1753 va = _mm_load_si128 ((__m128i *)(s8));
1754 vb = (!xor) ? _mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8));
1755 itb = val;
1756 while (1) {
1757 if (itb & 1) vb = _mm_xor_si128(vb, va);
1758 itb >>= 1;
1759 if (itb == 0) break;
1760 SSE_AB2(pp, m1, m2, va, t1, t2);
1761 }
1762 _mm_store_si128((__m128i *)d8, vb);
1763 d8 += 16;
1764 s8 += 16;
1765 }
1766
1767 gf_do_final_region_alignment(&rd);
1768 }
1769 #endif
1770
1771 static
1772 void
1773 gf_w8_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
1774 {
1775 uint64_t *s64, *d64, t1, t2, ta, tb, prod;
1776 struct gf_w8_bytwo_data *btd;
1777 gf_region_data rd;
1778
1779 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1780 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1781
1782 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
1783 gf_do_initial_region_alignment(&rd);
1784
1785 btd = (struct gf_w8_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
1786 s64 = (uint64_t *) rd.s_start;
1787 d64 = (uint64_t *) rd.d_start;
1788
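  /* Small constants are handled as fully unrolled sequences of AB2 doublings
     (multiplications by x) and xors of saved intermediate values, operating on
     eight packed bytes per 64-bit word.  For example, val == 6 is computed as
     2*a ^ 4*a: double, save, double again, xor. */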
1789 switch (val) {
1790 case 2:
1791 if (xor) {
1792 while (d64 < (uint64_t *) rd.d_top) {
1793 ta = *s64;
1794 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1795 *d64 ^= ta;
1796 d64++;
1797 s64++;
1798 }
1799 } else {
1800 while (d64 < (uint64_t *) rd.d_top) {
1801 ta = *s64;
1802 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1803 *d64 = ta;
1804 d64++;
1805 s64++;
1806 }
1807 }
1808 break;
1809 case 3:
1810 if (xor) {
1811 while (d64 < (uint64_t *) rd.d_top) {
1812 ta = *s64;
1813 prod = ta;
1814 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1815 *d64 ^= (ta ^ prod);
1816 d64++;
1817 s64++;
1818 }
1819 } else {
1820 while (d64 < (uint64_t *) rd.d_top) {
1821 ta = *s64;
1822 prod = ta;
1823 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1824 *d64 = (ta ^ prod);
1825 d64++;
1826 s64++;
1827 }
1828 }
1829 break;
1830 case 4:
1831 if (xor) {
1832 while (d64 < (uint64_t *) rd.d_top) {
1833 ta = *s64;
1834 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1835 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1836 *d64 ^= ta;
1837 d64++;
1838 s64++;
1839 }
1840 } else {
1841 while (d64 < (uint64_t *) rd.d_top) {
1842 ta = *s64;
1843 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1844 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1845 *d64 = ta;
1846 d64++;
1847 s64++;
1848 }
1849 }
1850 break;
1851 case 5:
1852 if (xor) {
1853 while (d64 < (uint64_t *) rd.d_top) {
1854 ta = *s64;
1855 prod = ta;
1856 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1857 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1858 *d64 ^= (ta ^ prod);
1859 d64++;
1860 s64++;
1861 }
1862 } else {
1863 while (d64 < (uint64_t *) rd.d_top) {
1864 ta = *s64;
1865 prod = ta;
1866 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1867 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1868 *d64 = ta ^ prod;
1869 d64++;
1870 s64++;
1871 }
1872 }
1873 break;
1874 case 6:
1875 if (xor) {
1876 while (d64 < (uint64_t *) rd.d_top) {
1877 ta = *s64;
1878 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1879 prod = ta;
1880 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1881 *d64 ^= (ta ^ prod);
1882 d64++;
1883 s64++;
1884 }
1885 } else {
1886 while (d64 < (uint64_t *) rd.d_top) {
1887 ta = *s64;
1888 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1889 prod = ta;
1890 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1891 *d64 = ta ^ prod;
1892 d64++;
1893 s64++;
1894 }
1895 }
1896 break;
1897 /*
1898 case 7:
1899 if (xor) {
1900 while (d64 < (uint64_t *) rd.d_top) {
1901 ta = *s64;
1902 prod = ta;
1903 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1904 prod ^= ta;
1905 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1906 *d64 ^= (ta ^ prod);
1907 d64++;
1908 s64++;
1909 }
1910 } else {
1911 while (d64 < (uint64_t *) rd.d_top) {
1912 ta = *s64;
1913 prod = ta;
1914 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1915 prod ^= ta;
1916 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1917 *d64 = ta ^ prod;
1918 d64++;
1919 s64++;
1920 }
1921 }
1922 break;
1923 */
1924 case 8:
1925 if (xor) {
1926 while (d64 < (uint64_t *) rd.d_top) {
1927 ta = *s64;
1928 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1929 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1930 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1931 *d64 ^= ta;
1932 d64++;
1933 s64++;
1934 }
1935 } else {
1936 while (d64 < (uint64_t *) rd.d_top) {
1937 ta = *s64;
1938 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1939 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1940 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1941 *d64 = ta;
1942 d64++;
1943 s64++;
1944 }
1945 }
1946 break;
1947 /*
1948 case 9:
1949 if (xor) {
1950 while (d64 < (uint64_t *) rd.d_top) {
1951 ta = *s64;
1952 prod = ta;
1953 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1954 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1955 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1956 *d64 ^= (ta ^ prod);
1957 d64++;
1958 s64++;
1959 }
1960 } else {
1961 while (d64 < (uint64_t *) rd.d_top) {
1962 ta = *s64;
1963 prod = ta;
1964 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1965 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1966 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1967 *d64 = (ta ^ prod);
1968 d64++;
1969 s64++;
1970 }
1971 }
1972 break;
1973 case 10:
1974 if (xor) {
1975 while (d64 < (uint64_t *) rd.d_top) {
1976 ta = *s64;
1977 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1978 prod = ta;
1979 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1980 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1981 *d64 ^= (ta ^ prod);
1982 d64++;
1983 s64++;
1984 }
1985 } else {
1986 while (d64 < (uint64_t *) rd.d_top) {
1987 ta = *s64;
1988 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1989 prod = ta;
1990 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1991 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1992 *d64 = (ta ^ prod);
1993 d64++;
1994 s64++;
1995 }
1996 }
1997 break;
1998 case 11:
1999 if (xor) {
2000 while (d64 < (uint64_t *) rd.d_top) {
2001 ta = *s64;
2002 prod = ta;
2003 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2004 prod ^= ta;
2005 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2006 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2007 *d64 ^= (ta ^ prod);
2008 d64++;
2009 s64++;
2010 }
2011 } else {
2012 while (d64 < (uint64_t *) rd.d_top) {
2013 ta = *s64;
2014 prod = ta;
2015 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2016 prod ^= ta;
2017 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2018 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2019 *d64 = (ta ^ prod);
2020 d64++;
2021 s64++;
2022 }
2023 }
2024 break;
2025 case 12:
2026 if (xor) {
2027 while (d64 < (uint64_t *) rd.d_top) {
2028 ta = *s64;
2029 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2030 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2031 prod = ta;
2032 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2033 *d64 ^= (ta ^ prod);
2034 d64++;
2035 s64++;
2036 }
2037 } else {
2038 while (d64 < (uint64_t *) rd.d_top) {
2039 ta = *s64;
2040 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2041 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2042 prod = ta;
2043 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2044 *d64 = (ta ^ prod);
2045 d64++;
2046 s64++;
2047 }
2048 }
2049 break;
2050 case 13:
2051 if (xor) {
2052 while (d64 < (uint64_t *) rd.d_top) {
2053 ta = *s64;
2054 prod = ta;
2055 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2056 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2057 prod ^= ta;
2058 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2059 *d64 ^= (ta ^ prod);
2060 d64++;
2061 s64++;
2062 }
2063 } else {
2064 while (d64 < (uint64_t *) rd.d_top) {
2065 ta = *s64;
2066 prod = ta;
2067 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2068 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2069 prod ^= ta;
2070 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2071 *d64 = (ta ^ prod);
2072 d64++;
2073 s64++;
2074 }
2075 }
2076 break;
2077 case 14:
2078 if (xor) {
2079 while (d64 < (uint64_t *) rd.d_top) {
2080 ta = *s64;
2081 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2082 prod = ta;
2083 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2084 prod ^= ta;
2085 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2086 *d64 ^= (ta ^ prod);
2087 d64++;
2088 s64++;
2089 }
2090 } else {
2091 while (d64 < (uint64_t *) rd.d_top) {
2092 ta = *s64;
2093 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2094 prod = ta;
2095 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2096 prod ^= ta;
2097 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2098 *d64 = (ta ^ prod);
2099 d64++;
2100 s64++;
2101 }
2102 }
2103 break;
2104 case 15:
2105 if (xor) {
2106 while (d64 < (uint64_t *) rd.d_top) {
2107 ta = *s64;
2108 prod = ta;
2109 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2110 prod ^= ta;
2111 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2112 prod ^= ta;
2113 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2114 *d64 ^= (ta ^ prod);
2115 d64++;
2116 s64++;
2117 }
2118 } else {
2119 while (d64 < (uint64_t *) rd.d_top) {
2120 ta = *s64;
2121 prod = ta;
2122 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2123 prod ^= ta;
2124 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2125 prod ^= ta;
2126 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2127 *d64 = (ta ^ prod);
2128 d64++;
2129 s64++;
2130 }
2131 }
2132 break;
2133 */
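/*
 * Generic multiplier: classic shift-and-add over the packed 64-bit word.  For
 * every set bit of val, the current power-of-two multiple of the source word
 * (maintained by repeated AB2() doublings) is XORed into prod.
 */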
2134 default:
2135 if (xor) {
2136 while (d64 < (uint64_t *) rd.d_top) {
2137 prod = *d64 ;
2138 ta = *s64;
2139 tb = val;
2140 while (1) {
2141 if (tb & 1) prod ^= ta;
2142 tb >>= 1;
2143 if (tb == 0) break;
2144 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2145 }
2146 *d64 = prod;
2147 d64++;
2148 s64++;
2149 }
2150 } else {
2151 while (d64 < (uint64_t *) rd.d_top) {
2152 prod = 0 ;
2153 ta = *s64;
2154 tb = val;
2155 while (1) {
2156 if (tb & 1) prod ^= ta;
2157 tb >>= 1;
2158 if (tb == 0) break;
2159 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
2160 }
2161 *d64 = prod;
2162 d64++;
2163 s64++;
2164 }
2165 }
2166 break;
2167 }
2168 gf_do_final_region_alignment(&rd);
2169 }
2170
2171 static
2172 int gf_w8_bytwo_init(gf_t *gf)
2173 {
2174 gf_internal_t *h;
2175 uint64_t ip, m1, m2;
2176 struct gf_w8_bytwo_data *btd;
2177
2178 h = (gf_internal_t *) gf->scratch;
2179 btd = (struct gf_w8_bytwo_data *) (h->private);
2180 ip = h->prim_poly & 0xff;
2181 m1 = 0xfe;
2182 m2 = 0x80;
2183 btd->prim_poly = 0;
2184 btd->mask1 = 0;
2185 btd->mask2 = 0;
2186
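/*
 * Replicate the primitive polynomial and the per-byte masks (0xfe, 0x80) into
 * every byte of a 64-bit word, so AB2()/SSE_AB2() can double eight field
 * elements at once.
 */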
2187 while (ip != 0) {
2188 btd->prim_poly |= ip;
2189 btd->mask1 |= m1;
2190 btd->mask2 |= m2;
2191 ip <<= GF_FIELD_WIDTH;
2192 m1 <<= GF_FIELD_WIDTH;
2193 m2 <<= GF_FIELD_WIDTH;
2194 }
2195
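/*
 * Pick the region handler: use the SSE2 version when the CPU supports it and
 * the caller did not request GF_REGION_NOSIMD; otherwise fall back to the
 * scalar version, failing only if SIMD support was explicitly demanded.
 */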
2196 if (h->mult_type == GF_MULT_BYTWO_p) {
2197 SET_FUNCTION(gf,multiply,w32,gf_w8_bytwo_p_multiply)
2198 #ifdef INTEL_SSE2
2199 if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
2200 SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_sse_multiply_region)
2201 } else {
2202 #endif
2203 SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_p_nosse_multiply_region)
2204 if(h->region_type & GF_REGION_SIMD)
2205 return 0;
2206 #ifdef INTEL_SSE2
2207 }
2208 #endif
2209 } else {
2210 SET_FUNCTION(gf,multiply,w32,gf_w8_bytwo_b_multiply)
2211 #ifdef INTEL_SSE2
2212 if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
2213 SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_b_sse_multiply_region)
2214 } else {
2215 #endif
2216 SET_FUNCTION(gf,multiply_region,w32,gf_w8_bytwo_b_nosse_multiply_region)
2217 if(h->region_type & GF_REGION_SIMD)
2218 return 0;
2219 #ifdef INTEL_SSE2
2220 }
2221 #endif
2222 }
2223 return 1;
2224 }
2225
2226
2227 /* ------------------------------------------------------------
2228 General procedures.
2229 You don't need to error check here or in init, because it's done
2230 for you in gf_error_check().
2231 */
2232
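/*
 * Returns the number of bytes the caller must allocate for gf->scratch: the
 * gf_internal_t header, the private data for the chosen multiplication
 * technique, plus 64 bytes of slack.  A return of 0 means the requested
 * combination of arguments is not supported.
 */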
2233 int gf_w8_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
2234 {
2235 switch(mult_type)
2236 {
2237 case GF_MULT_DEFAULT:
2238 if (gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) {
2239 return sizeof(gf_internal_t) + sizeof(struct gf_w8_default_data) + 64;
2240 }
2241 return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
2242 case GF_MULT_TABLE:
2243 if (region_type == GF_REGION_CAUCHY) {
2244 return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
2245 }
2246
2247 if (region_type == GF_REGION_DEFAULT) {
2248 return sizeof(gf_internal_t) + sizeof(struct gf_w8_single_table_data) + 64;
2249 }
2250 if (region_type & GF_REGION_DOUBLE_TABLE) {
2251 if (region_type == GF_REGION_DOUBLE_TABLE) {
2252 return sizeof(gf_internal_t) + sizeof(struct gf_w8_double_table_data) + 64;
2253 } else if (region_type == (GF_REGION_DOUBLE_TABLE | GF_REGION_LAZY)) {
2254 return sizeof(gf_internal_t) + sizeof(struct gf_w8_double_table_lazy_data) + 64;
2255 } else {
2256 return 0;
2257 }
2258 }
2259 return 0;
2260 break;
2261 case GF_MULT_BYTWO_p:
2262 case GF_MULT_BYTWO_b:
2263 return sizeof(gf_internal_t) + sizeof(struct gf_w8_bytwo_data);
2264 break;
2265 case GF_MULT_SPLIT_TABLE:
2266 if ((arg1 == 4 && arg2 == 8) || (arg1 == 8 && arg2 == 4)) {
2267 return sizeof(gf_internal_t) + sizeof(struct gf_w8_half_table_data) + 64;
2268 }
2269 break;
2270 case GF_MULT_LOG_TABLE:
2271 return sizeof(gf_internal_t) + sizeof(struct gf_w8_logtable_data) + 64;
2272 break;
2273 case GF_MULT_LOG_ZERO:
2274 return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_small_table_data) + 64;
2275 break;
2276 case GF_MULT_LOG_ZERO_EXT:
2277 return sizeof(gf_internal_t) + sizeof(struct gf_w8_logzero_table_data) + 64;
2278 break;
2279 case GF_MULT_CARRY_FREE:
2280 return sizeof(gf_internal_t);
2281 break;
2282 case GF_MULT_SHIFT:
2283 return sizeof(gf_internal_t);
2284 break;
2285 case GF_MULT_COMPOSITE:
2286 return sizeof(gf_internal_t) + sizeof(struct gf_w8_composite_data) + 64;
2287 default:
2288 return 0;
2289 }
2290 return 0;
2291 }
2292
2293 int gf_w8_init(gf_t *gf)
2294 {
2295 gf_internal_t *h;
2296
2297 h = (gf_internal_t *) gf->scratch;
2298
2299 /* Allen: set default primitive polynomial / irreducible polynomial if needed */
2300
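/* The default 0x11d is x^8 + x^4 + x^3 + x^2 + 1; for non-composite fields the
   0x100 bit is OR-ed in below so the x^8 term is explicit. */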
2301 if (h->prim_poly == 0) {
2302 if (h->mult_type == GF_MULT_COMPOSITE) {
2303 h->prim_poly = gf_composite_get_default_poly(h->base_gf);
2304 if (h->prim_poly == 0) return 0; /* JSP: This shouldn't happen, but just in case. */
2305 } else {
2306 h->prim_poly = 0x11d;
2307 }
2308 }
2309 if (h->mult_type != GF_MULT_COMPOSITE) {
2310 h->prim_poly |= 0x100;
2311 }
2312
2313 SET_FUNCTION(gf,multiply,w32,NULL)
2314 SET_FUNCTION(gf,divide,w32,NULL)
2315 SET_FUNCTION(gf,inverse,w32,NULL)
2316 SET_FUNCTION(gf,multiply_region,w32,NULL)
2317 SET_FUNCTION(gf,extract_word,w32,gf_w8_extract_word)
2318
2319 switch(h->mult_type) {
2320 case GF_MULT_DEFAULT:
2321 case GF_MULT_TABLE: if (gf_w8_table_init(gf) == 0) return 0; break;
2322 case GF_MULT_BYTWO_p:
2323 case GF_MULT_BYTWO_b: if (gf_w8_bytwo_init(gf) == 0) return 0; break;
2324 case GF_MULT_LOG_ZERO:
2325 case GF_MULT_LOG_ZERO_EXT:
2326 case GF_MULT_LOG_TABLE: if (gf_w8_log_init(gf) == 0) return 0; break;
2327 case GF_MULT_CARRY_FREE: if (gf_w8_cfm_init(gf) == 0) return 0; break;
2328 case GF_MULT_SHIFT: if (gf_w8_shift_init(gf) == 0) return 0; break;
2329 case GF_MULT_SPLIT_TABLE: if (gf_w8_split_init(gf) == 0) return 0; break;
2330 case GF_MULT_COMPOSITE: if (gf_w8_composite_init(gf) == 0) return 0; break;
2331 default: return 0;
2332 }
2333
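/*
 * Division/inversion fallbacks: an explicit divide_type selects Euclid or the
 * bit-matrix inverse; otherwise division is derived from the inverse (Euclid
 * by default), and if an implementation supplied only a divide routine, the
 * inverse is derived from it.
 */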
2334 if (h->divide_type == GF_DIVIDE_EUCLID) {
2335 SET_FUNCTION(gf,divide,w32,gf_w8_divide_from_inverse)
2336 SET_FUNCTION(gf,inverse,w32,gf_w8_euclid)
2337 } else if (h->divide_type == GF_DIVIDE_MATRIX) {
2338 SET_FUNCTION(gf,divide,w32,gf_w8_divide_from_inverse)
2339 SET_FUNCTION(gf,inverse,w32,gf_w8_matrix)
2340 }
2341
2342 if (gf->divide.w32 == NULL) {
2343 SET_FUNCTION(gf,divide,w32,gf_w8_divide_from_inverse)
2344 if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w8_euclid)
2345 }
2346
2347 if (gf->inverse.w32 == NULL) SET_FUNCTION(gf,inverse,w32,gf_w8_inverse_from_divide)
2348
2349 if (h->mult_type == GF_MULT_COMPOSITE && (h->region_type & GF_REGION_ALTMAP)) {
2350 SET_FUNCTION(gf,extract_word,w32,gf_w8_composite_extract_word)
2351 }
2352
2353 if (h->region_type == GF_REGION_CAUCHY) {
2354 SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region)
2355 SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word)
2356 }
2357
2358 if (gf->multiply_region.w32 == NULL) {
2359 SET_FUNCTION(gf,multiply_region,w32,gf_w8_multiply_region_from_single)
2360 }
2361
2362 return 1;
2363 }
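/*
 * Typical use (a minimal sketch, not part of this file): callers normally go
 * through gf_init_easy() or gf_init_hard() in gf.c rather than calling
 * gf_w8_init() directly, then invoke the function pointers that init filled in:
 *
 *   gf_t gf;
 *   if (gf_init_easy(&gf, 8)) {
 *     uint8_t p = gf.multiply.w32(&gf, 0x57, 0x13);   // product in GF(2^8)
 *     uint8_t q = gf.divide.w32(&gf, p, 0x13);        // q == 0x57
 *     gf_free(&gf, 0);
 *   }
 */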
2364
2365
2366 /* Inline setup functions */
2367
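/*
 * These helpers expose the flat 256x256 multiplication and division tables
 * when a table-based multiply is active; they return NULL for all other
 * techniques.
 */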
2368 uint8_t *gf_w8_get_mult_table(gf_t *gf)
2369 {
2370 gf_internal_t *h;
2371 struct gf_w8_default_data *ftd;
2372 struct gf_w8_single_table_data *std;
2373
2374 h = (gf_internal_t *) gf->scratch;
2375 if (gf->multiply.w32 == gf_w8_default_multiply) {
2376 ftd = (struct gf_w8_default_data *) h->private;
2377 return (uint8_t *) ftd->multtable;
2378 } else if (gf->multiply.w32 == gf_w8_table_multiply) {
2379 std = (struct gf_w8_single_table_data *) h->private;
2380 return (uint8_t *) std->multtable;
2381 }
2382 return NULL;
2383 }
2384
2385 uint8_t *gf_w8_get_div_table(gf_t *gf)
2386 {
2387 struct gf_w8_default_data *ftd;
2388 struct gf_w8_single_table_data *std;
2389
2390 if (gf->multiply.w32 == gf_w8_default_multiply) {
2391 ftd = (struct gf_w8_default_data *) ((gf_internal_t *) gf->scratch)->private;
2392 return (uint8_t *) ftd->divtable;
2393 } else if (gf->multiply.w32 == gf_w8_table_multiply) {
2394 std = (struct gf_w8_single_table_data *) ((gf_internal_t *) gf->scratch)->private;
2395 return (uint8_t *) std->divtable;
2396 }
2397 return NULL;
2398 }