1 /*
2 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
3 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
4 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
5 *
6 * gf_w32.c
7 *
8 * Routines for 32-bit Galois fields
9 */
10
11
12 #include "gf_int.h"
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include "gf_w32.h"
16 #include "gf_cpu.h"
17
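/* Debugging helpers: MM_PRINT32 and MM_PRINT8 dump a 128-bit SSE register to
   stdout, as four 32-bit words or as sixteen individual bytes respectively. */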
18 #define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); }
19
20 #define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? "   " : " ", blah[15-ii]); printf("\n"); }
21
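/* AB2 multiplies every field element packed in b by 2 (i.e. by x) in GF(2^w):
   t1 is the left-shifted words with the bits that crossed a word boundary
   masked off (am1); t2 isolates each word's high bit (am2), and
   (t2 << 1) - (t2 >> (w-1)) turns it into an all-ones mask for exactly the
   words whose high bit was set, so the primitive polynomial ip is xored in
   where the shift overflowed.  SSE_AB2 is the same step on a 128-bit register. */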
22 #define AB2(ip, am1 ,am2, b, t1, t2) {\
23 t1 = (b << 1) & am1;\
24 t2 = b & am2; \
25 t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \
26 b = (t1 ^ (t2 & ip));}
27
28 #define SSE_AB2(pp, m1 ,m2, va, t1, t2) {\
29 t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
30 t2 = _mm_and_si128(va, m2); \
31 t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \
32 va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); }
33
34 static
35 inline
36 uint32_t gf_w32_inverse_from_divide (gf_t *gf, uint32_t a)
37 {
38 return gf->divide.w32(gf, 1, a);
39 }
40
41 static
42 inline
43 uint32_t gf_w32_divide_from_inverse (gf_t *gf, uint32_t a, uint32_t b)
44 {
45 b = gf->inverse.w32(gf, b);
46 return gf->multiply.w32(gf, a, b);
47 }
48
49 static
50 void
51 gf_w32_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int
52 xor)
53 {
54 uint32_t i;
55 uint32_t *s32;
56 uint32_t *d32;
57
58 s32 = (uint32_t *) src;
59 d32 = (uint32_t *) dest;
60
61 if (xor) {
62 for (i = 0; i < bytes/sizeof(uint32_t); i++) {
63 d32[i] ^= gf->multiply.w32(gf, val, s32[i]);
64 }
65 } else {
66 for (i = 0; i < bytes/sizeof(uint32_t); i++) {
67 d32[i] = gf->multiply.w32(gf, val, s32[i]);
68 }
69 }
70 }
71
72 #if defined(INTEL_SSE4_PCLMUL)
73
74 static
75 void
76 gf_w32_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
77 {
78
79 uint32_t i;
80 uint32_t *s32;
81 uint32_t *d32;
82
83 __m128i a, b;
84 __m128i result;
85 __m128i prim_poly;
86 __m128i w;
87 gf_internal_t * h = gf->scratch;
88
89 prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
90
91 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
92 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
93
94 a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
95 s32 = (uint32_t *) src;
96 d32 = (uint32_t *) dest;
97
98 if (xor) {
99 for (i = 0; i < bytes/sizeof(uint32_t); i++) {
100 b = _mm_insert_epi32 (a, s32[i], 0);
101 result = _mm_clmulepi64_si128 (a, b, 0);
102 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
103 result = _mm_xor_si128 (result, w);
104 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
105 result = _mm_xor_si128 (result, w);
106 d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
107 }
108 } else {
109 for (i = 0; i < bytes/sizeof(uint32_t); i++) {
110 b = _mm_insert_epi32 (a, s32[i], 0);
111 result = _mm_clmulepi64_si128 (a, b, 0);
112 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
113 result = _mm_xor_si128 (result, w);
114 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
115 result = _mm_xor_si128 (result, w);
116 d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0));
117 }
118 }
119 }
120 #endif
121
122 #if defined(INTEL_SSE4_PCLMUL)
123
124 static
125 void
126 gf_w32_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
127 {
128
129 uint32_t i;
130 uint32_t *s32;
131 uint32_t *d32;
132
133 __m128i a, b;
134 __m128i result;
135 __m128i prim_poly;
136 __m128i w;
137 gf_internal_t * h = gf->scratch;
138
139 prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
140
141 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
142 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
143
144 a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
145
146 s32 = (uint32_t *) src;
147 d32 = (uint32_t *) dest;
148
149 if (xor) {
150 for (i = 0; i < bytes/sizeof(uint32_t); i++) {
151 b = _mm_insert_epi32 (a, s32[i], 0);
152 result = _mm_clmulepi64_si128 (a, b, 0);
153 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
154 result = _mm_xor_si128 (result, w);
155 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
156 result = _mm_xor_si128 (result, w);
157 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
158 result = _mm_xor_si128 (result, w);
159 d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
160 }
161 } else {
162 for (i = 0; i < bytes/sizeof(uint32_t); i++) {
163 b = _mm_insert_epi32 (a, s32[i], 0);
164 result = _mm_clmulepi64_si128 (a, b, 0);
165 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
166 result = _mm_xor_si128 (result, w);
167 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
168 result = _mm_xor_si128 (result, w);
169 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
170 result = _mm_xor_si128 (result, w);
171 d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0));
172 }
173 }
174 }
175 #endif
176
177 #if defined(INTEL_SSE4_PCLMUL)
178 static
179 void
180 gf_w32_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
181 {
182 uint32_t i;
183 uint32_t *s32;
184 uint32_t *d32;
185
186 __m128i a, b;
187 __m128i result;
188 __m128i prim_poly;
189 __m128i w;
190 gf_internal_t * h = gf->scratch;
191
192 prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
193
194 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
195 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
196
197 a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
198
199 s32 = (uint32_t *) src;
200 d32 = (uint32_t *) dest;
201
202 if (xor) {
203 for (i = 0; i < bytes/sizeof(uint32_t); i++) {
204 b = _mm_insert_epi32 (a, s32[i], 0);
205 result = _mm_clmulepi64_si128 (a, b, 0);
206 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
207 result = _mm_xor_si128 (result, w);
208 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
209 result = _mm_xor_si128 (result, w);
210 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
211 result = _mm_xor_si128 (result, w);
212 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
213 result = _mm_xor_si128 (result, w);
214 d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
215 }
216 } else {
217 for (i = 0; i < bytes/sizeof(uint32_t); i++) {
218 b = _mm_insert_epi32 (a, s32[i], 0);
219 result = _mm_clmulepi64_si128 (a, b, 0);
220 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
221 result = _mm_xor_si128 (result, w);
222 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
223 result = _mm_xor_si128 (result, w);
224 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
225 result = _mm_xor_si128 (result, w);
226 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
227 result = _mm_xor_si128 (result, w);
228 d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0));
229 }
230 }
231 }
232 #endif
233
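/* Computes the multiplicative inverse of b via the extended Euclidean
   algorithm on polynomials over GF(2), starting from the primitive polynomial
   as the modulus.  The d_* variables track degrees, the y_* variables the
   Bezout coefficients, and c_i accumulates each step's quotient bit by bit. */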
234 static
235 inline
236 uint32_t gf_w32_euclid (gf_t *gf, uint32_t b)
237 {
238 uint32_t e_i, e_im1, e_ip1;
239 uint32_t d_i, d_im1, d_ip1;
240 uint32_t y_i, y_im1, y_ip1;
241 uint32_t c_i;
242
243 if (b == 0) return -1;
244 e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
245 e_i = b;
246 d_im1 = 32;
247 for (d_i = d_im1-1; ((1 << d_i) & e_i) == 0; d_i--) ;
248 y_i = 1;
249 y_im1 = 0;
250
251 while (e_i != 1) {
252
253 e_ip1 = e_im1;
254 d_ip1 = d_im1;
255 c_i = 0;
256
257 while (d_ip1 >= d_i) {
258 c_i ^= (1 << (d_ip1 - d_i));
259 e_ip1 ^= (e_i << (d_ip1 - d_i));
260 d_ip1--;
261 if (e_ip1 == 0) return 0;
262 while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;
263 }
264
265 y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i);
266 y_im1 = y_i;
267 y_i = y_ip1;
268
269 e_im1 = e_i;
270 d_im1 = d_i;
271 e_i = e_ip1;
272 d_i = d_ip1;
273 }
274
275 return y_i;
276 }
277
278 static
279 gf_val_32_t gf_w32_extract_word(gf_t *gf, void *start, int bytes, int index)
280 {
281 uint32_t *r32, rv;
282
283 r32 = (uint32_t *) start;
284 rv = r32[index];
285 return rv;
286 }
287
288 static
289 gf_val_32_t gf_w32_composite_extract_word(gf_t *gf, void *start, int bytes, int index)
290 {
291 int sub_size;
292 gf_internal_t *h;
293 uint8_t *r8, *top;
294 uint32_t a, b, *r32;
295 gf_region_data rd;
296
297 h = (gf_internal_t *) gf->scratch;
298 gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32);
299 r32 = (uint32_t *) start;
300 if (r32 + index < (uint32_t *) rd.d_start) return r32[index];
301 if (r32 + index >= (uint32_t *) rd.d_top) return r32[index];
302 index -= (((uint32_t *) rd.d_start) - r32);
303 r8 = (uint8_t *) rd.d_start;
304 top = (uint8_t *) rd.d_top;
305 sub_size = (top-r8)/2;
306
307 a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index);
308 b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index);
309 return (a | (b << 16));
310 }
311
312 static
313 gf_val_32_t gf_w32_split_extract_word(gf_t *gf, void *start, int bytes, int index)
314 {
315 int i;
316 uint32_t *r32, rv;
317 uint8_t *r8;
318 gf_region_data rd;
319
320 gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 64);
321 r32 = (uint32_t *) start;
322 if (r32 + index < (uint32_t *) rd.d_start) return r32[index];
323 if (r32 + index >= (uint32_t *) rd.d_top) return r32[index];
324 index -= (((uint32_t *) rd.d_start) - r32);
325 r8 = (uint8_t *) rd.d_start;
326 r8 += ((index & 0xfffffff0)*4);
327 r8 += (index & 0xf);
328 r8 += 48;
329   rv = 0;
330 for (i = 0; i < 4; i++) {
331 rv <<= 8;
332 rv |= *r8;
333 r8 -= 16;
334 }
335 return rv;
336 }
337
338
339 static
340 inline
341 uint32_t gf_w32_matrix (gf_t *gf, uint32_t b)
342 {
343 return gf_bitmatrix_inverse(b, 32, ((gf_internal_t *) (gf->scratch))->prim_poly);
344 }
345
346 /* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm. I only
347 include it for completeness. It does have the feature that it requires no
348 extra memory.
349 */
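/* Worked example of the two phases at w = 4 (for readability), with
   prim_poly = x^4 + x + 1 = 0x13: multiplying a = 0x7 by b = 0x6, the first
   loop builds the carry-free product 0x6 ^ (0x6 << 1) ^ (0x6 << 2) = 0x12;
   the second loop sees bit 4 set and xors in 0x13, leaving 0x01, so 0x7 and
   0x6 are inverses under this polynomial.  gf_w32_shift_multiply() below is
   the same computation at w = 32 with a 64-bit intermediate product. */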
350
351 #if defined(INTEL_SSE4_PCLMUL)
352
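/* The CFM-GK routines use a Barrett-style reduction driven by two 64-bit
   constants stored in h->private: q_plus, effectively the quotient of x^64
   divided by the primitive polynomial, and g_star, the primitive polynomial
   with its leading x^32 term dropped.  gf_w32_cfmgk_init() computes both. */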
353 static
354 inline
355 gf_val_32_t
356 gf_w32_cfmgk_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
357 {
358 gf_val_32_t rv = 0;
359
360 __m128i a, b;
361 __m128i result;
362 __m128i w;
363 __m128i g, q;
364 gf_internal_t * h = gf->scratch;
365 uint64_t g_star, q_plus;
366
367 q_plus = *(uint64_t *) h->private;
368 g_star = *((uint64_t *) h->private + 1);
369
370 a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0);
371 b = _mm_insert_epi32 (a, b32, 0);
372 g = _mm_insert_epi64 (a, g_star, 0);
373 q = _mm_insert_epi64 (a, q_plus, 0);
374
375 result = _mm_clmulepi64_si128 (a, b, 0);
376 w = _mm_clmulepi64_si128 (q, _mm_srli_si128 (result, 4), 0);
377 w = _mm_clmulepi64_si128 (g, _mm_srli_si128 (w, 4), 0);
378 result = _mm_xor_si128 (result, w);
379
380 /* Extracts 32 bit value from result. */
381 rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
382 return rv;
383 }
384 #endif
385
386 #if defined(INTEL_SSE4_PCLMUL)
387
388 static
389 void
390 gf_w32_cfmgk_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
391 {
392
393 uint32_t i;
394 uint32_t *s32;
395 uint32_t *d32;
396
397 __m128i a, b;
398 __m128i result;
399 __m128i w;
400 __m128i g, q;
401 gf_internal_t * h = gf->scratch;
402 uint64_t g_star, q_plus;
403
404 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
405 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
406
407 q_plus = *(uint64_t *) h->private;
408 g_star = *((uint64_t *) h->private + 1);
409
410 a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
411 g = _mm_insert_epi64 (a, g_star, 0);
412 q = _mm_insert_epi64 (a, q_plus, 0);
413 s32 = (uint32_t *) src;
414 d32 = (uint32_t *) dest;
415
416 if (xor) {
417 for (i = 0; i < bytes/sizeof(uint32_t); i++) {
418 b = _mm_insert_epi32 (a, s32[i], 0);
419 result = _mm_clmulepi64_si128 (a, b, 0);
420 w = _mm_clmulepi64_si128 (q, _mm_srli_si128 (result, 4), 0);
421 w = _mm_clmulepi64_si128 (g, _mm_srli_si128 (w, 4), 0);
422 result = _mm_xor_si128 (result, w);
423 d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
424 }
425 } else {
426 for (i = 0; i < bytes/sizeof(uint32_t); i++) {
427 b = _mm_insert_epi32 (a, s32[i], 0);
428 result = _mm_clmulepi64_si128 (a, b, 0);
429 w = _mm_clmulepi64_si128 (q, _mm_srli_si128 (result, 4), 0);
430 w = _mm_clmulepi64_si128 (g, _mm_srli_si128 (w, 4), 0);
431 result = _mm_xor_si128 (result, w);
432 d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0));
433 }
434 }
435 }
436 #endif
437
438
439 #if defined(INTEL_SSE4_PCLMUL)
440
441 static
442 inline
443 gf_val_32_t
444 gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
445 {
446 gf_val_32_t rv = 0;
447
448 __m128i a, b;
449 __m128i result;
450 __m128i prim_poly;
451 __m128i w;
452 gf_internal_t * h = gf->scratch;
453
454
455 a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0);
456 b = _mm_insert_epi32 (a, b32, 0);
457
458 prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
459
460 /* Do the initial multiply */
461
462 result = _mm_clmulepi64_si128 (a, b, 0);
463
464   /* Ben: Do the prim_poly reduction twice.  We are guaranteed to need the
465      reduction at most twice, because (w-2)/z == 2, where z is the number
466      of zeros after the leading 1 of the primitive polynomial.
467
468      _mm_clmulepi64_si128 is the carryless multiply operation.  Here
469      _mm_srli_si128 shifts the result to the right by 4 bytes, which lets
470      us multiply prim_poly by the leading (overflow) bits of the result; we
471      then xor that product back into the result. */
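  /* Concretely: the carry-free product has degree at most 62, so its high
     half (bits 32..62) has degree at most 30.  One pass replaces those bits
     with (high half) * (prim_poly - x^32), whose degree is at most
     30 + 16 = 46 when all non-leading terms of the polynomial sit below
     bit 17 (the case selected in gf_w32_cfm_init); a second pass then brings
     the degree below 32. */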
472
473 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
474 result = _mm_xor_si128 (result, w);
475 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
476 result = _mm_xor_si128 (result, w);
477
478 /* Extracts 32 bit value from result. */
479 rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
480 return rv;
481 }
482 #endif
483
484 #if defined(INTEL_SSE4_PCLMUL)
485
486 static
487 inline
488 gf_val_32_t
489 gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
490 {
491 gf_val_32_t rv = 0;
492
493 __m128i a, b;
494 __m128i result;
495 __m128i prim_poly;
496 __m128i w;
497 gf_internal_t * h = gf->scratch;
498
499
500 a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0);
501 b = _mm_insert_epi32 (a, b32, 0);
502
503 prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
504
505 /* Do the initial multiply */
506
507 result = _mm_clmulepi64_si128 (a, b, 0);
508
509 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
510 result = _mm_xor_si128 (result, w);
511 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
512 result = _mm_xor_si128 (result, w);
513 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
514 result = _mm_xor_si128 (result, w);
515
516 /* Extracts 32 bit value from result. */
517
518 rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
519 return rv;
520 }
521 #endif
522
523 #if defined(INTEL_SSE4_PCLMUL)
524
525 static
526 inline
527 gf_val_32_t
528 gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
529 {
530 gf_val_32_t rv = 0;
531
532 __m128i a, b;
533 __m128i result;
534 __m128i prim_poly;
535 __m128i w;
536 gf_internal_t * h = gf->scratch;
537
538
539 a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0);
540 b = _mm_insert_epi32 (a, b32, 0);
541
542 prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));
543
544 /* Do the initial multiply */
545
546 result = _mm_clmulepi64_si128 (a, b, 0);
547
548 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
549 result = _mm_xor_si128 (result, w);
550 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
551 result = _mm_xor_si128 (result, w);
552 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
553 result = _mm_xor_si128 (result, w);
554 w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
555 result = _mm_xor_si128 (result, w);
556
557 /* Extracts 32 bit value from result. */
558
559 rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));
560 return rv;
561 }
562 #endif
563
564
565 static
566 inline
567 uint32_t
568 gf_w32_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32)
569 {
570 uint64_t product, i, pp, a, b, one;
571 gf_internal_t *h;
572
573 a = a32;
574 b = b32;
575 h = (gf_internal_t *) gf->scratch;
576 one = 1;
577 pp = h->prim_poly | (one << 32);
578
579 product = 0;
580
581 for (i = 0; i < GF_FIELD_WIDTH; i++) {
582 if (a & (one << i)) product ^= (b << i);
583 }
584 for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) {
585 if (product & (one << i)) product ^= (pp << (i-GF_FIELD_WIDTH));
586 }
587 return product;
588 }
589
590 static
591 int gf_w32_cfmgk_init(gf_t *gf)
592 {
593 SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
594 SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single)
595
596 #if defined(INTEL_SSE4_PCLMUL)
597 if (gf_cpu_supports_intel_pclmul) {
598 gf_internal_t *h;
599
600 h = (gf_internal_t *) gf->scratch;
601 SET_FUNCTION(gf,multiply,w32,gf_w32_cfmgk_multiply)
602 SET_FUNCTION(gf,multiply_region,w32,gf_w32_cfmgk_multiply_region_from_single)
603
604 uint64_t *q_plus = (uint64_t *) h->private;
605 uint64_t *g_star = (uint64_t *) h->private + 1;
606
607 uint64_t tmp = h->prim_poly << 32;
608 *q_plus = 1ULL << 32;
609
610 int i;
611 for(i = 63; i >= 32; i--)
612 if((1ULL << i) & tmp)
613 {
614 *q_plus |= 1ULL << (i-32);
615 tmp ^= h->prim_poly << (i-32);
616 }
617
618 *g_star = h->prim_poly & ((1ULL << 32) - 1);
619
620 return 1;
621 }
622 #endif
623
624 return 0;
625 }
626
627 static
628 int gf_w32_cfm_init(gf_t *gf)
629 {
630 SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
631 SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single)
632
633   /* Ben: We also check to see if the prim poly will work for pclmul. */
634   /* Ben: Check to see how many reduction steps it will take. */
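  /* The masks below test how many high-order bits of the primitive polynomial
     (other than the implicit x^32 term) are zero: polynomials with all
     non-leading terms below bit 17 need two reduction passes, below bit 22
     three, and below bit 25 four; anything denser is rejected (init returns 0). */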
635
636 #if defined(INTEL_SSE4_PCLMUL)
637 if (gf_cpu_supports_intel_pclmul) {
638 gf_internal_t *h;
639
640 h = (gf_internal_t *) gf->scratch;
641
642 if ((0xfffe0000 & h->prim_poly) == 0){
643 SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2)
644 SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_2)
645 }else if ((0xffc00000 & h->prim_poly) == 0){
646 SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_3)
647 SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_3)
648 }else if ((0xfe000000 & h->prim_poly) == 0){
649 SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4)
650 SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_4)
651 } else {
652 return 0;
653 }
654 return 1;
655 }
656 #endif
657
658 return 0;
659 }
660
661 static
662 int gf_w32_shift_init(gf_t *gf)
663 {
664 SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
665 SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single)
666 SET_FUNCTION(gf,multiply,w32,gf_w32_shift_multiply)
667 return 1;
668 }
669
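/* Builds the lazy GROUP table for the multiplier val: after this call,
   shift[x] = x * val in the field for every x below 2^arg1.  The table is
   filled by repeated doubling of val, adding one new bit of x per outer
   iteration. */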
670 static
671 void
672 gf_w32_group_set_shift_tables(uint32_t *shift, uint32_t val, gf_internal_t *h)
673 {
674 uint32_t i;
675 uint32_t j;
676
677 shift[0] = 0;
678
679 for (i = 1; i < ((uint32_t)1 << h->arg1); i <<= 1) {
680 for (j = 0; j < i; j++) shift[i|j] = shift[j]^val;
681 if (val & GF_FIRST_BIT) {
682 val <<= 1;
683 val ^= h->prim_poly;
684 } else {
685 val <<= 1;
686 }
687 }
688 }
689
690 static
691 void gf_w32_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
692 {
693 int leftover, rs;
694 uint32_t p, l, ind, a32;
695 int bits_left;
696 int g_s;
697 gf_region_data rd;
698 uint32_t *s32, *d32, *top;
699 struct gf_w32_group_data *gd;
700 gf_internal_t *h = (gf_internal_t *) gf->scratch;
701
702 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
703 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
704
705 gd = (struct gf_w32_group_data *) h->private;
706 g_s = h->arg1;
707 gf_w32_group_set_shift_tables(gd->shift, val, h);
708
709 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
710 gf_do_initial_region_alignment(&rd);
711
712 s32 = (uint32_t *) rd.s_start;
713 d32 = (uint32_t *) rd.d_start;
714 top = (uint32_t *) rd.d_top;
715
716 leftover = 32 % g_s;
717 if (leftover == 0) leftover = g_s;
718
719 while (d32 < top) {
720 rs = 32 - leftover;
721 a32 = *s32;
722 ind = a32 >> rs;
723 a32 <<= leftover;
724 p = gd->shift[ind];
725
726 bits_left = rs;
727 rs = 32 - g_s;
728
729 while (bits_left > 0) {
730 bits_left -= g_s;
731 ind = a32 >> rs;
732 a32 <<= g_s;
733 l = p >> rs;
734 p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s));
735 }
736 if (xor) p ^= *d32;
737 *d32 = p;
738 d32++;
739 s32++;
740 }
741 gf_do_final_region_alignment(&rd);
742 }
743
744 static
745 void gf_w32_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
746 {
747 uint32_t *s32, *d32, *top;
748 int i;
749 int leftover;
750 uint64_t p, l, r;
751 uint32_t a32, ind;
752 int g_s, g_r;
753 struct gf_w32_group_data *gd;
754 gf_region_data rd;
755
756 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
757 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
758
759 gf_internal_t *h = (gf_internal_t *) gf->scratch;
760 g_s = h->arg1;
761 g_r = h->arg2;
762 gd = (struct gf_w32_group_data *) h->private;
763 gf_w32_group_set_shift_tables(gd->shift, val, h);
764
765 leftover = GF_FIELD_WIDTH % g_s;
766 if (leftover == 0) leftover = g_s;
767
768 gd = (struct gf_w32_group_data *) h->private;
769 gf_w32_group_set_shift_tables(gd->shift, val, h);
770
771 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
772 gf_do_initial_region_alignment(&rd);
773
774 s32 = (uint32_t *) rd.s_start;
775 d32 = (uint32_t *) rd.d_start;
776 top = (uint32_t *) rd.d_top;
777
778 while (d32 < top) {
779 a32 = *s32;
780 ind = a32 >> (GF_FIELD_WIDTH - leftover);
781 p = gd->shift[ind];
782 p <<= g_s;
783 a32 <<= leftover;
784
785 i = (GF_FIELD_WIDTH - leftover);
786 while (i > g_s) {
787 ind = a32 >> (GF_FIELD_WIDTH-g_s);
788 p ^= gd->shift[ind];
789 a32 <<= g_s;
790 p <<= g_s;
791 i -= g_s;
792 }
793
794 ind = a32 >> (GF_FIELD_WIDTH-g_s);
795 p ^= gd->shift[ind];
796
797 for (i = gd->tshift ; i >= 0; i -= g_r) {
798 l = p & (gd->rmask << i);
799 r = gd->reduce[l >> (i+32)];
800 r <<= (i);
801 p ^= r;
802 }
803
804 if (xor) p ^= *d32;
805 *d32 = p;
806 d32++;
807 s32++;
808 }
809 gf_do_final_region_alignment(&rd);
810 }
811
812 static
813 inline
814 gf_val_32_t
815 gf_w32_group_s_equals_r_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
816 {
817 int leftover, rs;
818 uint32_t p, l, ind, a32;
819 int bits_left;
820 int g_s;
821
822 struct gf_w32_group_data *gd;
823 gf_internal_t *h = (gf_internal_t *) gf->scratch;
824 g_s = h->arg1;
825
826 gd = (struct gf_w32_group_data *) h->private;
827 gf_w32_group_set_shift_tables(gd->shift, b, h);
828
829 leftover = 32 % g_s;
830 if (leftover == 0) leftover = g_s;
831
832 rs = 32 - leftover;
833 a32 = a;
834 ind = a32 >> rs;
835 a32 <<= leftover;
836 p = gd->shift[ind];
837
838 bits_left = rs;
839 rs = 32 - g_s;
840
841 while (bits_left > 0) {
842 bits_left -= g_s;
843 ind = a32 >> rs;
844 a32 <<= g_s;
845 l = p >> rs;
846 p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s));
847 }
848 return p;
849 }
850
851 static
852 inline
853 gf_val_32_t
854 gf_w32_group_4_4_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
855 {
856 uint32_t p, l, ind, a32;
857
858 struct gf_w32_group_data *d44;
859 gf_internal_t *h = (gf_internal_t *) gf->scratch;
860
861 d44 = (struct gf_w32_group_data *) h->private;
862 gf_w32_group_set_shift_tables(d44->shift, b, h);
863
864 a32 = a;
865 ind = a32 >> 28;
866 a32 <<= 4;
867 p = d44->shift[ind];
868 ind = a32 >> 28;
869 a32 <<= 4;
870 l = p >> 28;
871 p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
872 ind = a32 >> 28;
873 a32 <<= 4;
874 l = p >> 28;
875 p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
876 ind = a32 >> 28;
877 a32 <<= 4;
878 l = p >> 28;
879 p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
880 ind = a32 >> 28;
881 a32 <<= 4;
882 l = p >> 28;
883 p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
884 ind = a32 >> 28;
885 a32 <<= 4;
886 l = p >> 28;
887 p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
888 ind = a32 >> 28;
889 a32 <<= 4;
890 l = p >> 28;
891 p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
892 ind = a32 >> 28;
893 l = p >> 28;
894 p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
895 return p;
896 }
897
898 static
899 inline
900 gf_val_32_t
901 gf_w32_group_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
902 {
903 int i;
904 int leftover;
905 uint64_t p, l, r;
906 uint32_t a32, ind;
907 int g_s, g_r;
908 struct gf_w32_group_data *gd;
909
910 gf_internal_t *h = (gf_internal_t *) gf->scratch;
911 g_s = h->arg1;
912 g_r = h->arg2;
913 gd = (struct gf_w32_group_data *) h->private;
914 gf_w32_group_set_shift_tables(gd->shift, b, h);
915
916 leftover = GF_FIELD_WIDTH % g_s;
917 if (leftover == 0) leftover = g_s;
918
919 a32 = a;
920 ind = a32 >> (GF_FIELD_WIDTH - leftover);
921 p = gd->shift[ind];
922 p <<= g_s;
923 a32 <<= leftover;
924
925 i = (GF_FIELD_WIDTH - leftover);
926 while (i > g_s) {
927 ind = a32 >> (GF_FIELD_WIDTH-g_s);
928 p ^= gd->shift[ind];
929 a32 <<= g_s;
930 p <<= g_s;
931 i -= g_s;
932 }
933
934 ind = a32 >> (GF_FIELD_WIDTH-g_s);
935 p ^= gd->shift[ind];
936
937 for (i = gd->tshift ; i >= 0; i -= g_r) {
938 l = p & (gd->rmask << i);
939 r = gd->reduce[l >> (i+32)];
940 r <<= (i);
941 p ^= r;
942 }
943 return p;
944 }
945
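/* The two BYTWO multipliers differ only in which operand is scanned:
   BYTWO_b walks the bits of a from least significant upward, doubling b in
   the field at each step, while BYTWO_p walks the bits of a from most
   significant downward, doubling the accumulating product (Horner's rule). */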
946 static
947 inline
948 gf_val_32_t
949 gf_w32_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
950 {
951 uint32_t prod, pp, bmask;
952 gf_internal_t *h;
953
954 h = (gf_internal_t *) gf->scratch;
955 pp = h->prim_poly;
956
957 prod = 0;
958 bmask = 0x80000000;
959
960 while (1) {
961 if (a & 1) prod ^= b;
962 a >>= 1;
963 if (a == 0) return prod;
964 if (b & bmask) {
965 b = ((b << 1) ^ pp);
966 } else {
967 b <<= 1;
968 }
969 }
970 }
971
972 static
973 inline
974 gf_val_32_t
975 gf_w32_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
976 {
977 uint32_t prod, pp, pmask, amask;
978 gf_internal_t *h;
979
980 h = (gf_internal_t *) gf->scratch;
981 pp = h->prim_poly;
982
983
984 prod = 0;
985 pmask = 0x80000000;
986 amask = 0x80000000;
987
988 while (amask != 0) {
989 if (prod & pmask) {
990 prod = ((prod << 1) ^ pp);
991 } else {
992 prod <<= 1;
993 }
994 if (a & amask) prod ^= b;
995 amask >>= 1;
996 }
997 return prod;
998 }
999
1000 static
1001 void
1002 gf_w32_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
1003 {
1004 uint64_t *s64, *d64, t1, t2, ta, prod, amask;
1005 gf_region_data rd;
1006 struct gf_w32_bytwo_data *btd;
1007
1008 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1009 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1010
1011 btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
1012
1013 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
1014 gf_do_initial_region_alignment(&rd);
1015
1016 s64 = (uint64_t *) rd.s_start;
1017 d64 = (uint64_t *) rd.d_start;
1018
1019 if (xor) {
1020 while (s64 < (uint64_t *) rd.s_top) {
1021 prod = 0;
1022 amask = 0x80000000;
1023 ta = *s64;
1024 while (amask != 0) {
1025 AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
1026 if (val & amask) prod ^= ta;
1027 amask >>= 1;
1028 }
1029 *d64 ^= prod;
1030 d64++;
1031 s64++;
1032 }
1033 } else {
1034 while (s64 < (uint64_t *) rd.s_top) {
1035 prod = 0;
1036 amask = 0x80000000;
1037 ta = *s64;
1038 while (amask != 0) {
1039 AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
1040 if (val & amask) prod ^= ta;
1041 amask >>= 1;
1042 }
1043 *d64 = prod;
1044 d64++;
1045 s64++;
1046 }
1047 }
1048 gf_do_final_region_alignment(&rd);
1049 }
1050
1051 #define BYTWO_P_ONESTEP {\
1052 SSE_AB2(pp, m1 ,m2, prod, t1, t2); \
1053 t1 = _mm_and_si128(v, one); \
1054 t1 = _mm_sub_epi32(t1, one); \
1055 t1 = _mm_and_si128(t1, ta); \
1056 prod = _mm_xor_si128(prod, t1); \
1057 v = _mm_srli_epi64(v, 1); }
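/* One BYTWO_p step on a 128-bit register: SSE_AB2 doubles prod in the field,
   then ta is xored into prod in the lanes whose current multiplier bit is set.
   v holds the bit-reversed complement of val (vrev in the caller), so the
   multiplier is consumed from its most significant bit downward, and
   (v & 1) - 1 yields an all-ones lane mask exactly when the real bit of val
   is 1. */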
1058
1059 #ifdef INTEL_SSE2
1060 static
1061 void
1062 gf_w32_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
1063 {
1064 int i;
1065 uint8_t *s8, *d8;
1066 uint32_t vrev;
1067 __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v;
1068 struct gf_w32_bytwo_data *btd;
1069 gf_region_data rd;
1070
1071 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1072 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1073
1074 btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
1075
1076 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
1077 gf_do_initial_region_alignment(&rd);
1078
1079 vrev = 0;
1080 for (i = 0; i < 32; i++) {
1081 vrev <<= 1;
1082 if (!(val & ((gf_val_32_t)1 << i))) vrev |= 1;
1083 }
1084
1085 s8 = (uint8_t *) rd.s_start;
1086 d8 = (uint8_t *) rd.d_start;
1087
1088 pp = _mm_set1_epi32(btd->prim_poly&0xffffffff);
1089 m1 = _mm_set1_epi32((btd->mask1)&0xffffffff);
1090 m2 = _mm_set1_epi32((btd->mask2)&0xffffffff);
1091 one = _mm_set1_epi32(1);
1092
1093 while (d8 < (uint8_t *) rd.d_top) {
1094 prod = _mm_setzero_si128();
1095 v = _mm_set1_epi32(vrev);
1096 ta = _mm_load_si128((__m128i *) s8);
1097 tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8);
1098 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1099 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1100 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1101 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1102 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1103 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1104 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1105 BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
1106 _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp));
1107 d8 += 16;
1108 s8 += 16;
1109 }
1110 gf_do_final_region_alignment(&rd);
1111 }
1112 #endif
1113
1114 static
1115 void
1116 gf_w32_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
1117 {
1118 uint64_t *s64, *d64, t1, t2, ta, tb, prod;
1119 struct gf_w32_bytwo_data *btd;
1120 gf_region_data rd;
1121
1122 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1123 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1124
1125 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
1126 gf_do_initial_region_alignment(&rd);
1127
1128 btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
1129 s64 = (uint64_t *) rd.s_start;
1130 d64 = (uint64_t *) rd.d_start;
1131
1132 switch (val) {
1133 case 2:
1134 if (xor) {
1135 while (d64 < (uint64_t *) rd.d_top) {
1136 ta = *s64;
1137 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1138 *d64 ^= ta;
1139 d64++;
1140 s64++;
1141 }
1142 } else {
1143 while (d64 < (uint64_t *) rd.d_top) {
1144 ta = *s64;
1145 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1146 *d64 = ta;
1147 d64++;
1148 s64++;
1149 }
1150 }
1151 break;
1152 case 3:
1153 if (xor) {
1154 while (d64 < (uint64_t *) rd.d_top) {
1155 ta = *s64;
1156 prod = ta;
1157 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1158 *d64 ^= (ta ^ prod);
1159 d64++;
1160 s64++;
1161 }
1162 } else {
1163 while (d64 < (uint64_t *) rd.d_top) {
1164 ta = *s64;
1165 prod = ta;
1166 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1167 *d64 = (ta ^ prod);
1168 d64++;
1169 s64++;
1170 }
1171 }
1172 break;
1173 case 4:
1174 if (xor) {
1175 while (d64 < (uint64_t *) rd.d_top) {
1176 ta = *s64;
1177 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1178 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1179 *d64 ^= ta;
1180 d64++;
1181 s64++;
1182 }
1183 } else {
1184 while (d64 < (uint64_t *) rd.d_top) {
1185 ta = *s64;
1186 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1187 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1188 *d64 = ta;
1189 d64++;
1190 s64++;
1191 }
1192 }
1193 break;
1194 case 5:
1195 if (xor) {
1196 while (d64 < (uint64_t *) rd.d_top) {
1197 ta = *s64;
1198 prod = ta;
1199 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1200 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1201 *d64 ^= (ta ^ prod);
1202 d64++;
1203 s64++;
1204 }
1205 } else {
1206 while (d64 < (uint64_t *) rd.d_top) {
1207 ta = *s64;
1208 prod = ta;
1209 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1210 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1211 *d64 = ta ^ prod;
1212 d64++;
1213 s64++;
1214 }
1215 }
1216 break;
1217 default:
1218 if (xor) {
1219 while (d64 < (uint64_t *) rd.d_top) {
1220 prod = *d64 ;
1221 ta = *s64;
1222 tb = val;
1223 while (1) {
1224 if (tb & 1) prod ^= ta;
1225 tb >>= 1;
1226 if (tb == 0) break;
1227 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1228 }
1229 *d64 = prod;
1230 d64++;
1231 s64++;
1232 }
1233 } else {
1234 while (d64 < (uint64_t *) rd.d_top) {
1235 prod = 0 ;
1236 ta = *s64;
1237 tb = val;
1238 while (1) {
1239 if (tb & 1) prod ^= ta;
1240 tb >>= 1;
1241 if (tb == 0) break;
1242 AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
1243 }
1244 *d64 = prod;
1245 d64++;
1246 s64++;
1247 }
1248 }
1249 break;
1250 }
1251 gf_do_final_region_alignment(&rd);
1252 }
1253
1254 #ifdef INTEL_SSE2
1255 static
1256 void
1257 gf_w32_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w32_bytwo_data *btd)
1258 {
1259 uint8_t *d8, *s8;
1260 __m128i pp, m1, m2, t1, t2, va;
1261
1262 s8 = (uint8_t *) rd->s_start;
1263 d8 = (uint8_t *) rd->d_start;
1264
1265 pp = _mm_set1_epi32(btd->prim_poly&0xffffffff);
1266 m1 = _mm_set1_epi32((btd->mask1)&0xffffffff);
1267 m2 = _mm_set1_epi32((btd->mask2)&0xffffffff);
1268
1269 while (d8 < (uint8_t *) rd->d_top) {
1270 va = _mm_load_si128 ((__m128i *)(s8));
1271 SSE_AB2(pp, m1, m2, va, t1, t2);
1272 _mm_store_si128((__m128i *)d8, va);
1273 d8 += 16;
1274 s8 += 16;
1275 }
1276 }
1277 #endif
1278
1279 #ifdef INTEL_SSE2
1280 static
1281 void
1282 gf_w32_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w32_bytwo_data *btd)
1283 {
1284 uint8_t *d8, *s8;
1285 __m128i pp, m1, m2, t1, t2, va, vb;
1286
1287 s8 = (uint8_t *) rd->s_start;
1288 d8 = (uint8_t *) rd->d_start;
1289
1290 pp = _mm_set1_epi32(btd->prim_poly&0xffffffff);
1291 m1 = _mm_set1_epi32((btd->mask1)&0xffffffff);
1292 m2 = _mm_set1_epi32((btd->mask2)&0xffffffff);
1293
1294 while (d8 < (uint8_t *) rd->d_top) {
1295 va = _mm_load_si128 ((__m128i *)(s8));
1296 SSE_AB2(pp, m1, m2, va, t1, t2);
1297 vb = _mm_load_si128 ((__m128i *)(d8));
1298 vb = _mm_xor_si128(vb, va);
1299 _mm_store_si128((__m128i *)d8, vb);
1300 d8 += 16;
1301 s8 += 16;
1302 }
1303 }
1304 #endif
1305
1306
1307 #ifdef INTEL_SSE2
1308 static
1309 void
1310 gf_w32_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
1311 {
1312 uint32_t itb;
1313 uint8_t *d8, *s8;
1314 __m128i pp, m1, m2, t1, t2, va, vb;
1315 struct gf_w32_bytwo_data *btd;
1316 gf_region_data rd;
1317
1318 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1319 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1320
1321 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
1322 gf_do_initial_region_alignment(&rd);
1323
1324 btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
1325
1326 if (val == 2) {
1327 if (xor) {
1328 gf_w32_bytwo_b_sse_region_2_xor(&rd, btd);
1329 } else {
1330 gf_w32_bytwo_b_sse_region_2_noxor(&rd, btd);
1331 }
1332 gf_do_final_region_alignment(&rd);
1333 return;
1334 }
1335
1336 s8 = (uint8_t *) rd.s_start;
1337 d8 = (uint8_t *) rd.d_start;
1338
1339 pp = _mm_set1_epi32(btd->prim_poly&0xffffffff);
1340 m1 = _mm_set1_epi32((btd->mask1)&0xffffffff);
1341 m2 = _mm_set1_epi32((btd->mask2)&0xffffffff);
1342
1343 while (d8 < (uint8_t *) rd.d_top) {
1344 va = _mm_load_si128 ((__m128i *)(s8));
1345 vb = (!xor) ? _mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8));
1346 itb = val;
1347 while (1) {
1348 if (itb & 1) vb = _mm_xor_si128(vb, va);
1349 itb >>= 1;
1350 if (itb == 0) break;
1351 SSE_AB2(pp, m1, m2, va, t1, t2);
1352 }
1353 _mm_store_si128((__m128i *)d8, vb);
1354 d8 += 16;
1355 s8 += 16;
1356 }
1357
1358 gf_do_final_region_alignment(&rd);
1359 }
1360 #endif
1361
1362 static
1363 int gf_w32_bytwo_init(gf_t *gf)
1364 {
1365 gf_internal_t *h;
1366 uint64_t ip, m1, m2;
1367 struct gf_w32_bytwo_data *btd;
1368
1369 h = (gf_internal_t *) gf->scratch;
1370 btd = (struct gf_w32_bytwo_data *) (h->private);
1371 ip = h->prim_poly & 0xffffffff;
1372 m1 = 0xfffffffe;
1373 m2 = 0x80000000;
1374 btd->prim_poly = 0;
1375 btd->mask1 = 0;
1376 btd->mask2 = 0;
1377
1378 while (ip != 0) {
1379 btd->prim_poly |= ip;
1380 btd->mask1 |= m1;
1381 btd->mask2 |= m2;
1382 ip <<= GF_FIELD_WIDTH;
1383 m1 <<= GF_FIELD_WIDTH;
1384 m2 <<= GF_FIELD_WIDTH;
1385 }
1386
1387 if (h->mult_type == GF_MULT_BYTWO_p) {
1388 SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_p_multiply)
1389 #ifdef INTEL_SSE2
1390 if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
1391 SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_sse_multiply_region)
1392 } else {
1393 #endif
1394 SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_nosse_multiply_region)
1395 if(h->region_type & GF_REGION_SIMD)
1396 return 0;
1397 #ifdef INTEL_SSE2
1398 }
1399 #endif
1400 } else {
1401 SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_b_multiply)
1402 #ifdef INTEL_SSE2
1403 if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
1404 SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_sse_multiply_region)
1405 } else {
1406 #endif
1407 SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_nosse_multiply_region)
1408 if(h->region_type & GF_REGION_SIMD)
1409 return 0;
1410 #ifdef INTEL_SSE2
1411 }
1412 #endif
1413 }
1414
1415 SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
1416 return 1;
1417 }
1418
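/* SPLIT 8/8 multiplication: a and b are each split into four bytes, and
   tables[i+j] holds the 256x256 products of a byte from position i of a with
   a byte from position j of b, already shifted into place and reduced, so the
   full product is the xor of sixteen table lookups. */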
1419 static
1420 inline
1421 uint32_t
1422 gf_w32_split_8_8_multiply (gf_t *gf, uint32_t a32, uint32_t b32)
1423 {
1424 uint32_t product, i, j, mask, tb;
1425 gf_internal_t *h;
1426 struct gf_w32_split_8_8_data *d8;
1427
1428 h = (gf_internal_t *) gf->scratch;
1429 d8 = (struct gf_w32_split_8_8_data *) h->private;
1430 product = 0;
1431 mask = 0xff;
1432
1433 for (i = 0; i < 4; i++) {
1434 tb = b32;
1435 for (j = 0; j < 4; j++) {
1436 product ^= d8->tables[i+j][a32&mask][tb&mask];
1437 tb >>= 8;
1438 }
1439 a32 >>= 8;
1440 }
1441 return product;
1442 }
1443
1444 static
1445 inline
1446 void
1447 gf_w32_split_8_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
1448 {
1449 gf_internal_t *h;
1450 uint32_t *s32, *d32, *top, p, a, v;
1451 struct gf_split_8_32_lazy_data *d8;
1452 struct gf_w32_split_8_8_data *d88;
1453 uint32_t *t[4];
1454 int i, j, k, change;
1455 uint32_t pp;
1456 gf_region_data rd;
1457
1458 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1459 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1460
1461 h = (gf_internal_t *) gf->scratch;
1462 if (h->arg1 == 32 || h->arg2 == 32 || h->mult_type == GF_MULT_DEFAULT) {
1463 d8 = (struct gf_split_8_32_lazy_data *) h->private;
1464 for (i = 0; i < 4; i++) t[i] = d8->tables[i];
1465 change = (val != d8->last_value);
1466 if (change) d8->last_value = val;
1467 } else {
1468 d88 = (struct gf_w32_split_8_8_data *) h->private;
1469 for (i = 0; i < 4; i++) t[i] = d88->region_tables[i];
1470 change = (val != d88->last_value);
1471 if (change) d88->last_value = val;
1472 }
1473 pp = h->prim_poly;
1474
1475 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
1476 gf_do_initial_region_alignment(&rd);
1477
1478 s32 = (uint32_t *) rd.s_start;
1479 d32 = (uint32_t *) rd.d_start;
1480 top = (uint32_t *) rd.d_top;
1481
1482 if (change) {
1483 v = val;
1484 for (i = 0; i < 4; i++) {
1485 t[i][0] = 0;
1486 for (j = 1; j < 256; j <<= 1) {
1487 for (k = 0; k < j; k++) {
1488 t[i][k^j] = (v ^ t[i][k]);
1489 }
1490 v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
1491 }
1492 }
1493 }
1494
1495 while (d32 < top) {
1496 p = (xor) ? *d32 : 0;
1497 a = *s32;
1498 i = 0;
1499 while (a != 0) {
1500 v = (a & 0xff);
1501 p ^= t[i][v];
1502 a >>= 8;
1503 i++;
1504 }
1505 *d32 = p;
1506 d32++;
1507 s32++;
1508 }
1509 gf_do_final_region_alignment(&rd);
1510 }
1511
1512 static
1513 inline
1514 void
1515 gf_w32_split_16_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
1516 {
1517 gf_internal_t *h;
1518 uint32_t *s32, *d32, *top, p, a, v;
1519 struct gf_split_16_32_lazy_data *d16;
1520 uint32_t *t[2];
1521 int i, j, k, change;
1522 uint32_t pp;
1523 gf_region_data rd;
1524
1525 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1526 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1527
1528 h = (gf_internal_t *) gf->scratch;
1529 d16 = (struct gf_split_16_32_lazy_data *) h->private;
1530 for (i = 0; i < 2; i++) t[i] = d16->tables[i];
1531 change = (val != d16->last_value);
1532 if (change) d16->last_value = val;
1533
1534 pp = h->prim_poly;
1535
1536 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
1537 gf_do_initial_region_alignment(&rd);
1538
1539 s32 = (uint32_t *) rd.s_start;
1540 d32 = (uint32_t *) rd.d_start;
1541 top = (uint32_t *) rd.d_top;
1542
1543 if (change) {
1544 v = val;
1545 for (i = 0; i < 2; i++) {
1546 t[i][0] = 0;
1547 for (j = 1; j < (1 << 16); j <<= 1) {
1548 for (k = 0; k < j; k++) {
1549 t[i][k^j] = (v ^ t[i][k]);
1550 }
1551 v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
1552 }
1553 }
1554 }
1555
1556 while (d32 < top) {
1557 p = (xor) ? *d32 : 0;
1558 a = *s32;
1559 i = 0;
1560 while (a != 0 && i < 2) {
1561 v = (a & 0xffff);
1562 p ^= t[i][v];
1563 a >>= 16;
1564 i++;
1565 }
1566 *d32 = p;
1567 d32++;
1568 s32++;
1569 }
1570 gf_do_final_region_alignment(&rd);
1571 }
1572
1573 static
1574 void
1575 gf_w32_split_2_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
1576 {
1577 gf_internal_t *h;
1578 struct gf_split_2_32_lazy_data *ld;
1579 int i;
1580 uint32_t pp, v, v2, s, *s32, *d32, *top;
1581 gf_region_data rd;
1582
1583 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1584 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1585
1586 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
1587 gf_do_initial_region_alignment(&rd);
1588
1589 h = (gf_internal_t *) gf->scratch;
1590 pp = h->prim_poly;
1591
1592 ld = (struct gf_split_2_32_lazy_data *) h->private;
1593
1594 if (ld->last_value != val) {
1595 v = val;
1596 for (i = 0; i < 16; i++) {
1597 v2 = (v << 1);
1598 if (v & GF_FIRST_BIT) v2 ^= pp;
1599 ld->tables[i][0] = 0;
1600 ld->tables[i][1] = v;
1601 ld->tables[i][2] = v2;
1602 ld->tables[i][3] = (v2 ^ v);
1603 v = (v2 << 1);
1604 if (v2 & GF_FIRST_BIT) v ^= pp;
1605 }
1606 }
1607 ld->last_value = val;
1608
1609 s32 = (uint32_t *) rd.s_start;
1610 d32 = (uint32_t *) rd.d_start;
1611 top = (uint32_t *) rd.d_top;
1612
1613 while (d32 != top) {
1614 v = (xor) ? *d32 : 0;
1615 s = *s32;
1616 i = 0;
1617 while (s != 0) {
1618 v ^= ld->tables[i][s&3];
1619 s >>= 2;
1620 i++;
1621 }
1622 *d32 = v;
1623 d32++;
1624 s32++;
1625 }
1626 gf_do_final_region_alignment(&rd);
1627 }
1628
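/* SSSE3 variant of SPLIT 2/32: tables[i] packs the four products
   {0, v, 2v, 3v}, with v = val * 4^i in the field, into one 128-bit register,
   and _mm_shuffle_epi8 acts as a parallel 2-bit table lookup, so each 32-bit
   source word is processed as sixteen 2-bit chunks without leaving SSE
   registers. */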
1629 #ifdef INTEL_SSSE3
1630 static
1631 void
1632 gf_w32_split_2_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
1633 {
1634 gf_internal_t *h;
1635 int i, tindex;
1636 uint32_t pp, v, v2, *s32, *d32, *top;
1637 __m128i vi, si, pi, shuffler, tables[16], adder, xi, mask1, mask2;
1638 gf_region_data rd;
1639
1640 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1641 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1642
1643 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
1644 gf_do_initial_region_alignment(&rd);
1645
1646 h = (gf_internal_t *) gf->scratch;
1647 pp = h->prim_poly;
1648
1649 s32 = (uint32_t *) rd.s_start;
1650 d32 = (uint32_t *) rd.d_start;
1651 top = (uint32_t *) rd.d_top;
1652
1653 v = val;
1654 for (i = 0; i < 16; i++) {
1655 v2 = (v << 1);
1656 if (v & GF_FIRST_BIT) v2 ^= pp;
1657 tables[i] = _mm_set_epi32(v2 ^ v, v2, v, 0);
1658 v = (v2 << 1);
1659 if (v2 & GF_FIRST_BIT) v ^= pp;
1660 }
1661
1662 shuffler = _mm_set_epi8(0xc, 0xc, 0xc, 0xc, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);
1663 adder = _mm_set_epi8(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0);
1664 mask1 = _mm_set1_epi8(0x3);
1665 mask2 = _mm_set1_epi8(0xc);
1666
1667 while (d32 != top) {
1668 pi = (xor) ? _mm_load_si128 ((__m128i *) d32) : _mm_setzero_si128();
1669 vi = _mm_load_si128((__m128i *) s32);
1670
1671 tindex = 0;
1672 for (i = 0; i < 4; i++) {
1673 si = _mm_shuffle_epi8(vi, shuffler);
1674
1675 xi = _mm_and_si128(si, mask1);
1676 xi = _mm_slli_epi16(xi, 2);
1677 xi = _mm_xor_si128(xi, adder);
1678 pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi));
1679 tindex++;
1680
1681 xi = _mm_and_si128(si, mask2);
1682 xi = _mm_xor_si128(xi, adder);
1683 pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi));
1684 si = _mm_srli_epi16(si, 2);
1685 tindex++;
1686
1687 xi = _mm_and_si128(si, mask2);
1688 xi = _mm_xor_si128(xi, adder);
1689 pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi));
1690 si = _mm_srli_epi16(si, 2);
1691 tindex++;
1692
1693 xi = _mm_and_si128(si, mask2);
1694 xi = _mm_xor_si128(xi, adder);
1695 pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi));
1696 tindex++;
1697
1698 vi = _mm_srli_epi32(vi, 8);
1699 }
1700 _mm_store_si128((__m128i *) d32, pi);
1701 d32 += 4;
1702 s32 += 4;
1703 }
1704
1705 gf_do_final_region_alignment(&rd);
1706
1707 }
1708 #endif
1709
1710 static
1711 void
1712 gf_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
1713 {
1714 gf_internal_t *h;
1715 struct gf_split_4_32_lazy_data *ld;
1716 int i, j, k;
1717 uint32_t pp, v, s, *s32, *d32, *top;
1718 gf_region_data rd;
1719
1720 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1721 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1722
1723 h = (gf_internal_t *) gf->scratch;
1724 pp = h->prim_poly;
1725
1726 ld = (struct gf_split_4_32_lazy_data *) h->private;
1727
1728 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
1729 gf_do_initial_region_alignment(&rd);
1730
1731 if (ld->last_value != val) {
1732 v = val;
1733 for (i = 0; i < 8; i++) {
1734 ld->tables[i][0] = 0;
1735 for (j = 1; j < 16; j <<= 1) {
1736 for (k = 0; k < j; k++) {
1737 ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
1738 }
1739 v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
1740 }
1741 }
1742 }
1743 ld->last_value = val;
1744
1745 s32 = (uint32_t *) rd.s_start;
1746 d32 = (uint32_t *) rd.d_start;
1747 top = (uint32_t *) rd.d_top;
1748
1749 while (d32 != top) {
1750 v = (xor) ? *d32 : 0;
1751 s = *s32;
1752 i = 0;
1753 while (s != 0) {
1754 v ^= ld->tables[i][s&0xf];
1755 s >>= 4;
1756 i++;
1757 }
1758 *d32 = v;
1759 d32++;
1760 s32++;
1761 }
1762 gf_do_final_region_alignment(&rd);
1763 }
1764
1765 #ifdef INTEL_SSSE3
1766 static
1767 void
1768 gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
1769 {
1770 gf_internal_t *h;
1771 int i, j, k;
1772 uint32_t pp, v, *s32, *d32, *top;
1773 __m128i si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3;
1774 struct gf_split_4_32_lazy_data *ld;
1775 uint8_t btable[16];
1776 gf_region_data rd;
1777
1778 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1779 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1780
1781 h = (gf_internal_t *) gf->scratch;
1782 pp = h->prim_poly;
1783
1784 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64);
1785 gf_do_initial_region_alignment(&rd);
1786
1787 s32 = (uint32_t *) rd.s_start;
1788 d32 = (uint32_t *) rd.d_start;
1789 top = (uint32_t *) rd.d_top;
1790
1791 ld = (struct gf_split_4_32_lazy_data *) h->private;
1792
1793 v = val;
1794 for (i = 0; i < 8; i++) {
1795 ld->tables[i][0] = 0;
1796 for (j = 1; j < 16; j <<= 1) {
1797 for (k = 0; k < j; k++) {
1798 ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
1799 }
1800 v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
1801 }
1802 for (j = 0; j < 4; j++) {
1803 for (k = 0; k < 16; k++) {
1804 btable[k] = (uint8_t) ld->tables[i][k];
1805 ld->tables[i][k] >>= 8;
1806 }
1807 tables[i][j] = _mm_loadu_si128((__m128i *) btable);
1808 }
1809 }
1810
1811 mask1 = _mm_set1_epi8(0xf);
1812
1813 if (xor) {
1814 while (d32 != top) {
1815 p0 = _mm_load_si128 ((__m128i *) d32);
1816 p1 = _mm_load_si128 ((__m128i *) (d32+4));
1817 p2 = _mm_load_si128 ((__m128i *) (d32+8));
1818 p3 = _mm_load_si128 ((__m128i *) (d32+12));
1819
1820 v0 = _mm_load_si128((__m128i *) s32); s32 += 4;
1821 v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
1822 v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
1823 v3 = _mm_load_si128((__m128i *) s32); s32 += 4;
1824
1825 si = _mm_and_si128(v0, mask1);
1826 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si));
1827 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si));
1828 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si));
1829 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si));
1830
1831 v0 = _mm_srli_epi32(v0, 4);
1832 si = _mm_and_si128(v0, mask1);
1833 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
1834 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
1835 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
1836 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si));
1837
1838 si = _mm_and_si128(v1, mask1);
1839 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si));
1840 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si));
1841 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si));
1842 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si));
1843
1844 v1 = _mm_srli_epi32(v1, 4);
1845 si = _mm_and_si128(v1, mask1);
1846 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si));
1847 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si));
1848 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si));
1849 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si));
1850
1851 si = _mm_and_si128(v2, mask1);
1852 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si));
1853 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si));
1854 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si));
1855 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si));
1856
1857 v2 = _mm_srli_epi32(v2, 4);
1858 si = _mm_and_si128(v2, mask1);
1859 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si));
1860 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si));
1861 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si));
1862 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si));
1863
1864 si = _mm_and_si128(v3, mask1);
1865 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[6][0], si));
1866 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[6][1], si));
1867 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[6][2], si));
1868 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[6][3], si));
1869
1870 v3 = _mm_srli_epi32(v3, 4);
1871 si = _mm_and_si128(v3, mask1);
1872 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si));
1873 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si));
1874 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si));
1875 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si));
1876
1877 _mm_store_si128((__m128i *) d32, p0);
1878 _mm_store_si128((__m128i *) (d32+4), p1);
1879 _mm_store_si128((__m128i *) (d32+8), p2);
1880 _mm_store_si128((__m128i *) (d32+12), p3);
1881 d32 += 16;
1882 }
1883 } else {
1884 while (d32 != top) {
1885
1886 v0 = _mm_load_si128((__m128i *) s32); s32 += 4;
1887 v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
1888 v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
1889 v3 = _mm_load_si128((__m128i *) s32); s32 += 4;
1890
1891 si = _mm_and_si128(v0, mask1);
1892 p0 = _mm_shuffle_epi8(tables[0][0], si);
1893 p1 = _mm_shuffle_epi8(tables[0][1], si);
1894 p2 = _mm_shuffle_epi8(tables[0][2], si);
1895 p3 = _mm_shuffle_epi8(tables[0][3], si);
1896
1897 v0 = _mm_srli_epi32(v0, 4);
1898 si = _mm_and_si128(v0, mask1);
1899 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
1900 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
1901 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
1902 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si));
1903
1904 si = _mm_and_si128(v1, mask1);
1905 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si));
1906 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si));
1907 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si));
1908 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si));
1909
1910 v1 = _mm_srli_epi32(v1, 4);
1911 si = _mm_and_si128(v1, mask1);
1912 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si));
1913 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si));
1914 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si));
1915 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si));
1916
1917 si = _mm_and_si128(v2, mask1);
1918 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si));
1919 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si));
1920 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si));
1921 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si));
1922
1923 v2 = _mm_srli_epi32(v2, 4);
1924 si = _mm_and_si128(v2, mask1);
1925 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si));
1926 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si));
1927 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si));
1928 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si));
1929
1930 si = _mm_and_si128(v3, mask1);
1931 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[6][0], si));
1932 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[6][1], si));
1933 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[6][2], si));
1934 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[6][3], si));
1935
1936 v3 = _mm_srli_epi32(v3, 4);
1937 si = _mm_and_si128(v3, mask1);
1938 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si));
1939 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si));
1940 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si));
1941 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si));
1942
1943 _mm_store_si128((__m128i *) d32, p0);
1944 _mm_store_si128((__m128i *) (d32+4), p1);
1945 _mm_store_si128((__m128i *) (d32+8), p2);
1946 _mm_store_si128((__m128i *) (d32+12), p3);
1947 d32 += 16;
1948 }
1949 }
1950
1951 gf_do_final_region_alignment(&rd);
1952 }
1953 #endif
1954
1955
1956 #ifdef INTEL_SSSE3
1957 static
1958 void
1959 gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
1960 {
1961 gf_internal_t *h;
1962 int i, j, k;
1963 uint32_t pp, v, *s32, *d32, *top, tmp_table[16];
1964 __m128i si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3, mask8;
1965 __m128i tv1, tv2, tv3, tv0;
1966 uint8_t btable[16];
1967 gf_region_data rd;
1968
1969 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
1970 if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }
1971
1972 h = (gf_internal_t *) gf->scratch;
1973 pp = h->prim_poly;
1974
1975 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64);
1976 gf_do_initial_region_alignment(&rd);
1977
1978 s32 = (uint32_t *) rd.s_start;
1979 d32 = (uint32_t *) rd.d_start;
1980 top = (uint32_t *) rd.d_top;
1981
1982 v = val;
1983 for (i = 0; i < 8; i++) {
1984 tmp_table[0] = 0;
1985 for (j = 1; j < 16; j <<= 1) {
1986 for (k = 0; k < j; k++) {
1987 tmp_table[k^j] = (v ^ tmp_table[k]);
1988 }
1989 v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
1990 }
1991 for (j = 0; j < 4; j++) {
1992 for (k = 0; k < 16; k++) {
1993 btable[k] = (uint8_t) tmp_table[k];
1994 tmp_table[k] >>= 8;
1995 }
1996 tables[i][j] = _mm_loadu_si128((__m128i *) btable);
1997 }
1998 }
1999
2000 mask1 = _mm_set1_epi8(0xf);
2001 mask8 = _mm_set1_epi16(0xff);
2002
2003 if (xor) {
2004 while (d32 != top) {
2005 v0 = _mm_load_si128((__m128i *) s32); s32 += 4;
2006 v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
2007 v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
2008 v3 = _mm_load_si128((__m128i *) s32); s32 += 4;
2009
2010 p0 = _mm_srli_epi16(v0, 8);
2011 p1 = _mm_srli_epi16(v1, 8);
2012 p2 = _mm_srli_epi16(v2, 8);
2013 p3 = _mm_srli_epi16(v3, 8);
2014
2015 tv0 = _mm_and_si128(v0, mask8);
2016 tv1 = _mm_and_si128(v1, mask8);
2017 tv2 = _mm_and_si128(v2, mask8);
2018 tv3 = _mm_and_si128(v3, mask8);
2019
2020 v0 = _mm_packus_epi16(p1, p0);
2021 v1 = _mm_packus_epi16(tv1, tv0);
2022 v2 = _mm_packus_epi16(p3, p2);
2023 v3 = _mm_packus_epi16(tv3, tv2);
2024
2025 p0 = _mm_srli_epi16(v0, 8);
2026 p1 = _mm_srli_epi16(v1, 8);
2027 p2 = _mm_srli_epi16(v2, 8);
2028 p3 = _mm_srli_epi16(v3, 8);
2029
2030 tv0 = _mm_and_si128(v0, mask8);
2031 tv1 = _mm_and_si128(v1, mask8);
2032 tv2 = _mm_and_si128(v2, mask8);
2033 tv3 = _mm_and_si128(v3, mask8);
2034
2035 v0 = _mm_packus_epi16(p2, p0);
2036 v1 = _mm_packus_epi16(p3, p1);
2037 v2 = _mm_packus_epi16(tv2, tv0);
2038 v3 = _mm_packus_epi16(tv3, tv1);
2039
2040 si = _mm_and_si128(v0, mask1);
2041 p0 = _mm_shuffle_epi8(tables[6][0], si);
2042 p1 = _mm_shuffle_epi8(tables[6][1], si);
2043 p2 = _mm_shuffle_epi8(tables[6][2], si);
2044 p3 = _mm_shuffle_epi8(tables[6][3], si);
2045
2046 v0 = _mm_srli_epi32(v0, 4);
2047 si = _mm_and_si128(v0, mask1);
2048 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si));
2049 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si));
2050 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si));
2051 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si));
2052
2053 si = _mm_and_si128(v1, mask1);
2054 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si));
2055 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si));
2056 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si));
2057 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si));
2058
2059 v1 = _mm_srli_epi32(v1, 4);
2060 si = _mm_and_si128(v1, mask1);
2061 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si));
2062 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si));
2063 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si));
2064 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si));
2065
2066 si = _mm_and_si128(v2, mask1);
2067 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si));
2068 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si));
2069 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si));
2070 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si));
2071
2072 v2 = _mm_srli_epi32(v2, 4);
2073 si = _mm_and_si128(v2, mask1);
2074 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si));
2075 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si));
2076 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si));
2077 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si));
2078
2079 si = _mm_and_si128(v3, mask1);
2080 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si));
2081 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si));
2082 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si));
2083 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si));
2084
2085 v3 = _mm_srli_epi32(v3, 4);
2086 si = _mm_and_si128(v3, mask1);
2087 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
2088 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
2089 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
2090 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si));
2091
2092 tv0 = _mm_unpackhi_epi8(p1, p3);
2093 tv1 = _mm_unpackhi_epi8(p0, p2);
2094 tv2 = _mm_unpacklo_epi8(p1, p3);
2095 tv3 = _mm_unpacklo_epi8(p0, p2);
2096
2097 p0 = _mm_unpackhi_epi8(tv1, tv0);
2098 p1 = _mm_unpacklo_epi8(tv1, tv0);
2099 p2 = _mm_unpackhi_epi8(tv3, tv2);
2100 p3 = _mm_unpacklo_epi8(tv3, tv2);
2101
2102 v0 = _mm_load_si128 ((__m128i *) d32);
2103 v1 = _mm_load_si128 ((__m128i *) (d32+4));
2104 v2 = _mm_load_si128 ((__m128i *) (d32+8));
2105 v3 = _mm_load_si128 ((__m128i *) (d32+12));
2106
2107 p0 = _mm_xor_si128(p0, v0);
2108 p1 = _mm_xor_si128(p1, v1);
2109 p2 = _mm_xor_si128(p2, v2);
2110 p3 = _mm_xor_si128(p3, v3);
2111
2112 _mm_store_si128((__m128i *) d32, p0);
2113 _mm_store_si128((__m128i *) (d32+4), p1);
2114 _mm_store_si128((__m128i *) (d32+8), p2);
2115 _mm_store_si128((__m128i *) (d32+12), p3);
2116 d32 += 16;
2117 }
2118 } else {
2119 while (d32 != top) {
2120 v0 = _mm_load_si128((__m128i *) s32); s32 += 4;
2121 v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
2122 v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
2123 v3 = _mm_load_si128((__m128i *) s32); s32 += 4;
2124
2125 p0 = _mm_srli_epi16(v0, 8);
2126 p1 = _mm_srli_epi16(v1, 8);
2127 p2 = _mm_srli_epi16(v2, 8);
2128 p3 = _mm_srli_epi16(v3, 8);
2129
2130 tv0 = _mm_and_si128(v0, mask8);
2131 tv1 = _mm_and_si128(v1, mask8);
2132 tv2 = _mm_and_si128(v2, mask8);
2133 tv3 = _mm_and_si128(v3, mask8);
2134
2135 v0 = _mm_packus_epi16(p1, p0);
2136 v1 = _mm_packus_epi16(tv1, tv0);
2137 v2 = _mm_packus_epi16(p3, p2);
2138 v3 = _mm_packus_epi16(tv3, tv2);
2139
2140 p0 = _mm_srli_epi16(v0, 8);
2141 p1 = _mm_srli_epi16(v1, 8);
2142 p2 = _mm_srli_epi16(v2, 8);
2143 p3 = _mm_srli_epi16(v3, 8);
2144
2145 tv0 = _mm_and_si128(v0, mask8);
2146 tv1 = _mm_and_si128(v1, mask8);
2147 tv2 = _mm_and_si128(v2, mask8);
2148 tv3 = _mm_and_si128(v3, mask8);
2149
2150 v0 = _mm_packus_epi16(p2, p0);
2151 v1 = _mm_packus_epi16(p3, p1);
2152 v2 = _mm_packus_epi16(tv2, tv0);
2153 v3 = _mm_packus_epi16(tv3, tv1);
2154
2155 si = _mm_and_si128(v0, mask1);
2156 p0 = _mm_shuffle_epi8(tables[6][0], si);
2157 p1 = _mm_shuffle_epi8(tables[6][1], si);
2158 p2 = _mm_shuffle_epi8(tables[6][2], si);
2159 p3 = _mm_shuffle_epi8(tables[6][3], si);
2160
2161 v0 = _mm_srli_epi32(v0, 4);
2162 si = _mm_and_si128(v0, mask1);
2163 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si));
2164 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si));
2165 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si));
2166 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si));
2167
2168 si = _mm_and_si128(v1, mask1);
2169 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si));
2170 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si));
2171 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si));
2172 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si));
2173
2174 v1 = _mm_srli_epi32(v1, 4);
2175 si = _mm_and_si128(v1, mask1);
2176 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si));
2177 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si));
2178 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si));
2179 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si));
2180
2181 si = _mm_and_si128(v2, mask1);
2182 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si));
2183 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si));
2184 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si));
2185 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si));
2186
2187 v2 = _mm_srli_epi32(v2, 4);
2188 si = _mm_and_si128(v2, mask1);
2189 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si));
2190 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si));
2191 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si));
2192 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si));
2193
2194 si = _mm_and_si128(v3, mask1);
2195 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si));
2196 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si));
2197 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si));
2198 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si));
2199
2200 v3 = _mm_srli_epi32(v3, 4);
2201 si = _mm_and_si128(v3, mask1);
2202 p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
2203 p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
2204 p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
2205 p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si));
2206
2207 tv0 = _mm_unpackhi_epi8(p1, p3);
2208 tv1 = _mm_unpackhi_epi8(p0, p2);
2209 tv2 = _mm_unpacklo_epi8(p1, p3);
2210 tv3 = _mm_unpacklo_epi8(p0, p2);
2211
2212 p0 = _mm_unpackhi_epi8(tv1, tv0);
2213 p1 = _mm_unpacklo_epi8(tv1, tv0);
2214 p2 = _mm_unpackhi_epi8(tv3, tv2);
2215 p3 = _mm_unpacklo_epi8(tv3, tv2);
2216
2217 _mm_store_si128((__m128i *) d32, p0);
2218 _mm_store_si128((__m128i *) (d32+4), p1);
2219 _mm_store_si128((__m128i *) (d32+8), p2);
2220 _mm_store_si128((__m128i *) (d32+12), p3);
2221 d32 += 16;
2222 }
2223 }
2224 gf_do_final_region_alignment(&rd);
2225 }
2226 #endif
2227
2228 static
2229 int gf_w32_split_init(gf_t *gf)
2230 {
2231 gf_internal_t *h;
2232 struct gf_split_2_32_lazy_data *ld2;
2233 struct gf_split_4_32_lazy_data *ld4;
2234 struct gf_w32_split_8_8_data *d8;
2235 struct gf_split_8_32_lazy_data *d32;
2236 struct gf_split_16_32_lazy_data *d16;
2237 uint32_t p, basep;
2238 int i, j, exp;
2239
2240 h = (gf_internal_t *) gf->scratch;
2241
2242 /* Defaults */
2243
2244 SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
2245
2246 /* JSP: First handle single multiplication:
2247 If args == 8, then we're doing split 8 8.
2248 Otherwise, if PCLMUL, we use that.
2249 Otherwise, we use bytwo_p.
2250 */
2251
2252 if (h->arg1 == 8 && h->arg2 == 8) {
2253 SET_FUNCTION(gf,multiply,w32,gf_w32_split_8_8_multiply)
2254 #if defined(INTEL_SSE4_PCLMUL)
2255 } else if (gf_cpu_supports_intel_pclmul) {
2256 if ((0xfffe0000 & h->prim_poly) == 0){
2257 SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2)
2258 } else if ((0xffc00000 & h->prim_poly) == 0){
2259 SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_3)
2260 } else if ((0xfe000000 & h->prim_poly) == 0){
2261 SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4)
2262 }
2263 #endif
2264 } else {
2265 SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_p_multiply)
2266 }
2267
2268 /* Easy cases: 16/32 and 2/32 */
2269
2270 if ((h->arg1 == 16 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 16)) {
2271 d16 = (struct gf_split_16_32_lazy_data *) h->private;
2272 d16->last_value = 0;
2273 SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_16_32_lazy_multiply_region)
2274 return 1;
2275 }
2276
2277 if ((h->arg1 == 2 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 2)) {
2278 ld2 = (struct gf_split_2_32_lazy_data *) h->private;
2279 ld2->last_value = 0;
2280 #ifdef INTEL_SSSE3
2281 if (gf_cpu_supports_intel_ssse3 && !(h->region_type & GF_REGION_NOSIMD)) {
2282 SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_2_32_lazy_sse_multiply_region)
2283 } else {
2284 #endif
2285 SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_2_32_lazy_multiply_region)
2286 if(h->region_type & GF_REGION_SIMD) return 0;
2287 #ifdef INTEL_SSSE3
2288 }
2289 #endif
2290 return 1;
2291 }
2292
2293 /* 4/32, or Default when SIMD is available. The NOSIMD and ALTMAP region variants are dispatched below. */
2294
2295
2296 if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) ||
2297 ((gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) && h->mult_type == GF_MULT_DEFAULT)) {
2298 ld4 = (struct gf_split_4_32_lazy_data *) h->private;
2299 ld4->last_value = 0;
2300 if ((h->region_type & GF_REGION_NOSIMD) || !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon)) {
2301 SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_multiply_region)
2302 } else if (gf_cpu_supports_arm_neon) {
2303 #ifdef ARM_NEON
2304 gf_w32_neon_split_init(gf);
2305 #endif
2306 } else if (h->region_type & GF_REGION_ALTMAP) {
2307 #ifdef INTEL_SSSE3
2308 SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_sse_altmap_multiply_region)
2309 #endif
2310 } else {
2311 #ifdef INTEL_SSSE3
2312 SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_sse_multiply_region)
2313 #endif
2314 }
2315 return 1;
2316 }
2317
2318 /* 8/32 or Default + no SSE */
2319
2320 if ((h->arg1 == 8 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 8) ||
2321 h->mult_type == GF_MULT_DEFAULT) {
2322 d32 = (struct gf_split_8_32_lazy_data *) h->private;
2323 d32->last_value = 0;
2324 SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_8_32_lazy_multiply_region)
2325 return 1;
2326 }
2327
2328 /* Finally, if args == 8, then we have to set up the tables here. */
2329
2330 if (h->arg1 == 8 && h->arg2 == 8) {
2331 d8 = (struct gf_w32_split_8_8_data *) h->private;
2332 d8->last_value = 0;
2333 SET_FUNCTION(gf,multiply,w32,gf_w32_split_8_8_multiply)
2334 SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_8_32_lazy_multiply_region)
2335 basep = 1;
2336 for (exp = 0; exp < 7; exp++) {
2337 for (j = 0; j < 256; j++) d8->tables[exp][0][j] = 0;
2338 for (i = 0; i < 256; i++) d8->tables[exp][i][0] = 0;
2339 d8->tables[exp][1][1] = basep;
2340 for (i = 2; i < 256; i++) {
2341 if (i&1) {
2342 p = d8->tables[exp][i^1][1];
2343 d8->tables[exp][i][1] = p ^ basep;
2344 } else {
2345 p = d8->tables[exp][i>>1][1];
2346 d8->tables[exp][i][1] = GF_MULTBY_TWO(p);
2347 }
2348 }
2349 for (i = 1; i < 256; i++) {
2350 p = d8->tables[exp][i][1];
2351 for (j = 1; j < 256; j++) {
2352 if (j&1) {
2353 d8->tables[exp][i][j] = d8->tables[exp][i][j^1] ^ p;
2354 } else {
2355 d8->tables[exp][i][j] = GF_MULTBY_TWO(d8->tables[exp][i][j>>1]);
2356 }
2357 }
2358 }
2359 for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep);
2360 }
2361 return 1;
2362 }
2363
2364 /* If we get here, then the arguments were bad. */
2365
2366 return 0;
2367 }
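/*
 * Selecting one of the split variants from user code, as a sketch
 * assuming the public gf_init_hard()/gf_free() entry points declared in
 * gf_complete.h (prim_poly 0 requests the default polynomial; arg1/arg2
 * of 4 and 32 request the split 4/32 tables set up above; a, b, src,
 * dst and bytes are caller data):
 *
 *   gf_t gf;
 *   if (!gf_init_hard(&gf, 32, GF_MULT_SPLIT_TABLE, GF_REGION_DEFAULT,
 *                     GF_DIVIDE_DEFAULT, 0, 4, 32, NULL, NULL)) {
 *     return -1;                      // bad or unsupported arguments
 *   }
 *   uint32_t c = gf.multiply.w32(&gf, a, b);             // c = a*b
 *   gf.multiply_region.w32(&gf, src, dst, c, bytes, 1);  // dst ^= c*src
 *   gf_free(&gf, 0);
 */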
2368
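/*
 * GROUP multiplication in outline: the multiplier is consumed g_s bits
 * at a time against a (1 << g_s)-entry table of partial products
 * (gd->shift), and the bits that spill past bit 31 are folded back in
 * g_r bits at a time using the (1 << g_r)-entry reduce table built
 * below; gd->rmask and gd->tshift locate those spilled bits.  See
 * gf_w32_group_multiply() and gf_w32_group_s_equals_r_multiply() above
 * for the two code paths this init routine selects between.
 */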
2369 static
2370 int gf_w32_group_init(gf_t *gf)
2371 {
2372 uint32_t i, j, p, index;
2373 struct gf_w32_group_data *gd;
2374 gf_internal_t *h = (gf_internal_t *) gf->scratch;
2375 uint32_t g_r, g_s;
2376
2377 g_s = h->arg1;
2378 g_r = h->arg2;
2379
2380 gd = (struct gf_w32_group_data *) h->private;
2381 gd->shift = (uint32_t *) (&(gd->memory));
2382 gd->reduce = gd->shift + (1 << g_s);
2383
2384 gd->rmask = (1 << g_r) - 1;
2385 gd->rmask <<= 32;
2386
2387 gd->tshift = 32 % g_s;
2388 if (gd->tshift == 0) gd->tshift = g_s;
2389 gd->tshift = (32 - gd->tshift);
2390 gd->tshift = ((gd->tshift-1)/g_r) * g_r;
2391
2392 gd->reduce[0] = 0;
2393 for (i = 0; i < ((uint32_t)1 << g_r); i++) {
2394 p = 0;
2395 index = 0;
2396 for (j = 0; j < g_r; j++) {
2397 if (i & (1 << j)) {
2398 p ^= (h->prim_poly << j);
2399 index ^= (1 << j);
2400 index ^= (h->prim_poly >> (32-j));
2401 }
2402 }
2403 gd->reduce[index] = p;
2404 }
2405
2406 if (g_s == g_r) {
2407 SET_FUNCTION(gf,multiply,w32,gf_w32_group_s_equals_r_multiply)
2408 SET_FUNCTION(gf,multiply_region,w32,gf_w32_group_s_equals_r_multiply_region)
2409 } else {
2410 SET_FUNCTION(gf,multiply,w32,gf_w32_group_multiply)
2411 SET_FUNCTION(gf,multiply_region,w32,gf_w32_group_multiply_region)
2412 }
2413 SET_FUNCTION(gf,divide,w32,NULL)
2414 SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
2415
2416 return 1;
2417 }
2418
2419
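/*
 * Composite multiplication over GF((2^16)^2): writing a = a1*x + a0 and
 * b = b1*x + b0 with 16-bit halves, and reducing by the defining
 * polynomial x^2 + s*x + 1 (s is the composite field's h->prim_poly, so
 * x^2 = s*x + 1), the product is
 *
 *   a*b = (a1*b0 + a0*b1 + s*a1*b1)*x + (a0*b0 + a1*b1)
 *
 * The two routines below compute exactly this, the first through calls
 * into the base field and the second through the base field's
 * log/antilog tables.
 */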
2420 static
2421 uint32_t
2422 gf_w32_composite_multiply_recursive(gf_t *gf, uint32_t a, uint32_t b)
2423 {
2424 gf_internal_t *h = (gf_internal_t *) gf->scratch;
2425 gf_t *base_gf = h->base_gf;
2426 uint32_t b0 = b & 0x0000ffff;
2427 uint32_t b1 = (b & 0xffff0000) >> 16;
2428 uint32_t a0 = a & 0x0000ffff;
2429 uint32_t a1 = (a & 0xffff0000) >> 16;
2430 uint32_t a1b1;
2431 uint32_t rv;
2432 a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
2433
2434 rv = ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16) | (base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1);
2435 return rv;
2436 }
2437
2438 /* JSP: This could be made faster. Someday, when I'm bored. */
2439
2440 static
2441 uint32_t
2442 gf_w32_composite_multiply_inline(gf_t *gf, uint32_t a, uint32_t b)
2443 {
2444 gf_internal_t *h = (gf_internal_t *) gf->scratch;
2445 uint32_t b0 = b & 0x0000ffff;
2446 uint32_t b1 = b >> 16;
2447 uint32_t a0 = a & 0x0000ffff;
2448 uint32_t a1 = a >> 16;
2449 uint32_t a1b1, prod;
2450 uint16_t *log, *alog;
2451 struct gf_w32_composite_data *cd;
2452
2453 cd = (struct gf_w32_composite_data *) h->private;
2454 log = cd->log;
2455 alog = cd->alog;
2456
2457 a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1);
2458 prod = GF_W16_INLINE_MULT(log, alog, a1, b0);
2459 prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1);
2460 prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly);
2461 prod <<= 16;
2462 prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0);
2463 prod ^= a1b1;
2464 return prod;
2465 }
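/*
 * GF_W16_INLINE_MULT (from gf_w16.h) is a log/antilog table lookup in
 * the base GF(2^16).  The tables are fetched from the base gf in
 * gf_w32_composite_init() below, and the inline routine above is only
 * installed when they exist (cd->log != NULL); otherwise the recursive
 * routine is used.
 */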
2466
2467 /*
2468 * Composite field division trick (explained in 2007 tech report)
2469 *
2470 * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1
2471 *
2472 * let c = b^-1
2473 *
2474 * c*b = (s*b1c1+b1c0+b0c1)x+(b1c1+b0c0)
2475 *
2476 * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1
2477 *
2478 * let d = b1c1 and d+1 = b0c0
2479 *
2480 * solve s*b1c1+b1c0+b0c1 = 0
2481 * (substitute c1 = d*b1^-1 and c0 = (d+1)*b0^-1, giving d*(b1b0^-1 + b0b1^-1 + s) = b1b0^-1, so)
2482 * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1
2483 *
2484 * c0 = (d+1)b0^-1
2485 * c1 = d*b1^-1
2486 *
2487 * a / b = a * c
2488 */
2489
2490 static
2491 uint32_t
2492 gf_w32_composite_inverse(gf_t *gf, uint32_t a)
2493 {
2494 gf_internal_t *h = (gf_internal_t *) gf->scratch;
2495 gf_t *base_gf = h->base_gf;
2496 uint16_t a0 = a & 0x0000ffff;
2497 uint16_t a1 = (a & 0xffff0000) >> 16;
2498 uint16_t c0, c1, d, tmp;
2499 uint32_t c;
2500 uint16_t a0inv, a1inv;
2501
2502 if (a0 == 0) {
2503 a1inv = base_gf->inverse.w32(base_gf, a1);
2504 c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly);
2505 c1 = a1inv;
2506 } else if (a1 == 0) {
2507 c0 = base_gf->inverse.w32(base_gf, a0);
2508 c1 = 0;
2509 } else {
2510 a1inv = base_gf->inverse.w32(base_gf, a1);
2511 a0inv = base_gf->inverse.w32(base_gf, a0);
2512
2513 d = base_gf->multiply.w32(base_gf, a1, a0inv);
2514
2515 tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^ base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly);
2516 tmp = base_gf->inverse.w32(base_gf, tmp);
2517
2518 d = base_gf->multiply.w32(base_gf, d, tmp);
2519
2520 c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv);
2521 c1 = base_gf->multiply.w32(base_gf, d, a1inv);
2522 }
2523
2524 c = c0 | (c1 << 16);
2525
2526 return c;
2527 }
2528
2529 static
2530 void
2531 gf_w32_composite_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
2532 {
2533 gf_internal_t *h = (gf_internal_t *) gf->scratch;
2534 gf_t *base_gf = h->base_gf;
2535 uint32_t b0 = val & 0x0000ffff;
2536 uint32_t b1 = (val & 0xffff0000) >> 16;
2537 uint32_t *s32, *d32, *top;
2538 uint16_t a0, a1, a1b1, *log, *alog;
2539 uint32_t prod;
2540 gf_region_data rd;
2541 struct gf_w32_composite_data *cd;
2542
2543 cd = (struct gf_w32_composite_data *) h->private;
2544 log = cd->log;
2545 alog = cd->alog;
2546
2547 if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
2548 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
2549
2550 s32 = rd.s_start;
2551 d32 = rd.d_start;
2552 top = rd.d_top;
2553
2554 if (log == NULL) {
2555 if (xor) {
2556 while (d32 < top) {
2557 a0 = *s32 & 0x0000ffff;
2558 a1 = (*s32 & 0xffff0000) >> 16;
2559 a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
2560
2561 *d32 ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
2562 ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16));
2563 s32++;
2564 d32++;
2565 }
2566 } else {
2567 while (d32 < top) {
2568 a0 = *s32 & 0x0000ffff;
2569 a1 = (*s32 & 0xffff0000) >> 16;
2570 a1b1 = base_gf->multiply.w32(base_gf, a1, b1);
2571
2572 *d32 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
2573 ((base_gf->multiply.w32(base_gf, a1, b0) ^ base_gf->multiply.w32(base_gf, a0, b1) ^ base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16));
2574 s32++;
2575 d32++;
2576 }
2577 }
2578 } else {
2579 if (xor) {
2580 while (d32 < top) {
2581 a0 = *s32 & 0x0000ffff;
2582 a1 = (*s32 & 0xffff0000) >> 16;
2583 a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1);
2584
2585 prod = GF_W16_INLINE_MULT(log, alog, a1, b0);
2586 prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1);
2587 prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly);
2588 prod <<= 16;
2589 prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0);
2590 prod ^= a1b1;
2591 *d32 ^= prod;
2592 s32++;
2593 d32++;
2594 }
2595 } else {
2596 while (d32 < top) {
2597 a0 = *s32 & 0x0000ffff;
2598 a1 = (*s32 & 0xffff0000) >> 16;
2599 a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1);
2600
2601 prod = GF_W16_INLINE_MULT(log, alog, a1, b0);
2602 prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1);
2603 prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly);
2604 prod <<= 16;
2605 prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0);
2606 prod ^= a1b1;
2607
2608 *d32 = prod;
2609 s32++;
2610 d32++;
2611 }
2612 }
2613 }
2614 }
2615
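/*
 * ALTMAP layout for composite regions: the low 16-bit halves of the
 * words occupy the first half of the region and the high halves the
 * second half, so each half can be passed to the base field's
 * multiply_region as one contiguous GF(2^16) buffer (slow/shigh and
 * dlow/dhigh below).
 */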
2616 static
2617 void
2618 gf_w32_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
2619 {
2620 gf_internal_t *h = (gf_internal_t *) gf->scratch;
2621 gf_t *base_gf = h->base_gf;
2622 uint16_t val0 = val & 0x0000ffff;
2623 uint16_t val1 = (val & 0xffff0000) >> 16;
2624 gf_region_data rd;
2625 int sub_reg_size;
2626 uint8_t *slow, *shigh;
2627 uint8_t *dlow, *dhigh, *top;
2628
2629 /* JSP: I want the two pointers aligned wrt each other on 16 byte
2630 boundaries. So I'm going to make sure that the area on
2631 which the two operate is a multiple of 32. Of course, that
2632 junks up the mapping, but so be it -- that's why we have extract_word.... */
2633
2634 gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
2635 gf_do_initial_region_alignment(&rd);
2636
2637 slow = (uint8_t *) rd.s_start;
2638 dlow = (uint8_t *) rd.d_start;
2639 top = (uint8_t *) rd.d_top;
2640 sub_reg_size = (top - dlow)/2;
2641 shigh = slow + sub_reg_size;
2642 dhigh = dlow + sub_reg_size;
2643
2644 base_gf->multiply_region.w32(base_gf, slow, dlow, val0, sub_reg_size, xor);
2645 base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1);
2646 base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor);
2647 base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1);
2648 base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1);
2649
2650 gf_do_final_region_alignment(&rd);
2651 }
2652
2653 static
2654 int gf_w32_composite_init(gf_t *gf)
2655 {
2656 gf_internal_t *h = (gf_internal_t *) gf->scratch;
2657 struct gf_w32_composite_data *cd;
2658
2659 if (h->base_gf == NULL) return 0;
2660
2661 cd = (struct gf_w32_composite_data *) h->private;
2662 cd->log = gf_w16_get_log_table(h->base_gf);
2663 cd->alog = gf_w16_get_mult_alog_table(h->base_gf);
2664
2665 if (h->region_type & GF_REGION_ALTMAP) {
2666 SET_FUNCTION(gf,multiply_region,w32,gf_w32_composite_multiply_region_alt)
2667 } else {
2668 SET_FUNCTION(gf,multiply_region,w32,gf_w32_composite_multiply_region)
2669 }
2670
2671 if (cd->log == NULL) {
2672 SET_FUNCTION(gf,multiply,w32,gf_w32_composite_multiply_recursive)
2673 } else {
2674 SET_FUNCTION(gf,multiply,w32,gf_w32_composite_multiply_inline)
2675 }
2676 SET_FUNCTION(gf,divide,w32,NULL)
2677 SET_FUNCTION(gf,inverse,w32,gf_w32_composite_inverse)
2678
2679 return 1;
2680 }
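/*
 * Building a composite GF(2^32) over a GF(2^16) base field, as a sketch
 * assuming the public API in gf_complete.h (arg1 == 2 selects the
 * degree-2 extension; prim_poly 0 picks the default s; p and d are
 * caller values with d != 0):
 *
 *   gf_t gf16, gf32;
 *   gf_init_easy(&gf16, 16);
 *   gf_init_hard(&gf32, 32, GF_MULT_COMPOSITE, GF_REGION_DEFAULT,
 *                GF_DIVIDE_DEFAULT, 0, 2, 0, &gf16, NULL);
 *   uint32_t q = gf32.divide.w32(&gf32, p, d);  // via composite_inverse
 *   gf_free(&gf32, 0);
 *   gf_free(&gf16, 0);
 */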
2681
2682
2683
2684 int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
2685 {
2686 switch(mult_type)
2687 {
2688 case GF_MULT_BYTWO_p:
2689 case GF_MULT_BYTWO_b:
2690 return sizeof(gf_internal_t) + sizeof(struct gf_w32_bytwo_data) + 64;
2691 break;
2692 case GF_MULT_GROUP:
2693 return sizeof(gf_internal_t) + sizeof(struct gf_w32_group_data) +
2694 sizeof(uint32_t) * (1 << arg1) +
2695 sizeof(uint32_t) * (1 << arg2) + 64;
2696 break;
2697 case GF_MULT_DEFAULT:
2698
2699 case GF_MULT_SPLIT_TABLE:
2700 if (arg1 == 8 && arg2 == 8){
2701 return sizeof(gf_internal_t) + sizeof(struct gf_w32_split_8_8_data) + 64;
2702 }
2703 if ((arg1 == 16 && arg2 == 32) || (arg2 == 16 && arg1 == 32)) {
2704 return sizeof(gf_internal_t) + sizeof(struct gf_split_16_32_lazy_data) + 64;
2705 }
2706 if ((arg1 == 2 && arg2 == 32) || (arg2 == 2 && arg1 == 32)) {
2707 return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64;
2708 }
2709 if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) ||
2710 (mult_type == GF_MULT_DEFAULT && !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon))) {
2711 return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64;
2712 }
2713 if ((arg1 == 4 && arg2 == 32) ||
2714 (arg2 == 4 && arg1 == 32) ||
2715 mult_type == GF_MULT_DEFAULT) {
2716 return sizeof(gf_internal_t) + sizeof(struct gf_split_4_32_lazy_data) + 64;
2717 }
2718 return 0;
2719 case GF_MULT_CARRY_FREE:
2720 return sizeof(gf_internal_t);
2721 break;
2722 case GF_MULT_CARRY_FREE_GK:
2723 return sizeof(gf_internal_t) + sizeof(uint64_t)*2;
2724 break;
2725 case GF_MULT_SHIFT:
2726 return sizeof(gf_internal_t);
2727 break;
2728 case GF_MULT_COMPOSITE:
2729 return sizeof(gf_internal_t) + sizeof(struct gf_w32_composite_data) + 64;
2730 break;
2731
2732 default:
2733 return 0;
2734 }
2735 return 0;
2736 }
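/*
 * The sizes computed above pair with caller-provided scratch memory, as
 * a sketch assuming the public gf_scratch_size() wrapper in
 * gf_complete.h (it dispatches here for w == 32):
 *
 *   int sz = gf_scratch_size(32, GF_MULT_SPLIT_TABLE, GF_REGION_DEFAULT,
 *                            GF_DIVIDE_DEFAULT, 8, 8);
 *   void *scratch = malloc(sz);
 *   gf_t gf;
 *   gf_init_hard(&gf, 32, GF_MULT_SPLIT_TABLE, GF_REGION_DEFAULT,
 *                GF_DIVIDE_DEFAULT, 0, 8, 8, NULL, scratch);
 *   uint32_t c = gf.multiply.w32(&gf, a, b);
 *   gf_free(&gf, 0);      // the caller still owns and frees scratch
 *   free(scratch);
 */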
2737
2738 int gf_w32_init(gf_t *gf)
2739 {
2740 gf_internal_t *h;
2741
2742 h = (gf_internal_t *) gf->scratch;
2743
2744 /* Allen: set default primitive polynomial / irreducible polynomial if needed */
2745
2746 if (h->prim_poly == 0) {
2747 if (h->mult_type == GF_MULT_COMPOSITE) {
2748 h->prim_poly = gf_composite_get_default_poly(h->base_gf);
2749 if (h->prim_poly == 0) return 0; /* This shouldn't happen */
2750 } else {
2751
2752 /* Allen: use the following primitive polynomial to make carryless multiply work more efficiently for GF(2^32).*/
2753
2754 /* h->prim_poly = 0xc5; */
2755
2756 /* Allen: The following is the traditional primitive polynomial for GF(2^32) */
2757
2758 h->prim_poly = 0x400007;
2759 }
2760 }
2761
2762 /* No leading one */
2763
2764 if(h->mult_type != GF_MULT_COMPOSITE) h->prim_poly &= 0xffffffff;
2765
2766 SET_FUNCTION(gf,multiply,w32,NULL)
2767 SET_FUNCTION(gf,divide,w32,NULL)
2768 SET_FUNCTION(gf,inverse,w32,NULL)
2769 SET_FUNCTION(gf,multiply_region,w32,NULL)
2770
2771 switch(h->mult_type) {
2772 case GF_MULT_CARRY_FREE: if (gf_w32_cfm_init(gf) == 0) return 0; break;
2773 case GF_MULT_CARRY_FREE_GK: if (gf_w32_cfmgk_init(gf) == 0) return 0; break;
2774 case GF_MULT_SHIFT: if (gf_w32_shift_init(gf) == 0) return 0; break;
2775 case GF_MULT_COMPOSITE: if (gf_w32_composite_init(gf) == 0) return 0; break;
2776 case GF_MULT_DEFAULT:
2777 case GF_MULT_SPLIT_TABLE: if (gf_w32_split_init(gf) == 0) return 0; break;
2778 case GF_MULT_GROUP: if (gf_w32_group_init(gf) == 0) return 0; break;
2779 case GF_MULT_BYTWO_p:
2780 case GF_MULT_BYTWO_b: if (gf_w32_bytwo_init(gf) == 0) return 0; break;
2781 default: return 0;
2782 }
2783 if (h->divide_type == GF_DIVIDE_EUCLID) {
2784 SET_FUNCTION(gf,divide,w32,gf_w32_divide_from_inverse)
2785 SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
2786 } else if (h->divide_type == GF_DIVIDE_MATRIX) {
2787 SET_FUNCTION(gf,divide,w32,gf_w32_divide_from_inverse)
2788 SET_FUNCTION(gf,inverse,w32,gf_w32_matrix)
2789 }
2790
2791 if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) {
2792 SET_FUNCTION(gf,divide,w32,gf_w32_divide_from_inverse)
2793 }
2794 if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) {
2795 SET_FUNCTION(gf,inverse,w32,gf_w32_inverse_from_divide)
2796 }
2797 if (h->region_type == GF_REGION_CAUCHY) {
2798 SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word)
2799 SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region)
2800 } else if (h->region_type & GF_REGION_ALTMAP) {
2801 if (h->mult_type == GF_MULT_COMPOSITE) {
2802 SET_FUNCTION(gf,extract_word,w32,gf_w32_composite_extract_word)
2803 } else {
2804 SET_FUNCTION(gf,extract_word,w32,gf_w32_split_extract_word)
2805 }
2806 } else {
2807 SET_FUNCTION(gf,extract_word,w32,gf_w32_extract_word)
2808 }
2809 return 1;
2810 }
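/*
 * End-to-end use of the defaults wired up above, as a sketch assuming
 * the public API in gf_complete.h (note that divide is derived from
 * inverse when only inverse is set):
 *
 *   gf_t gf;
 *   uint32_t a = 0x12345678, b = 0x9abcdef0, c, d;
 *   gf_init_easy(&gf, 32);                    // GF_MULT_DEFAULT path
 *   c = gf.multiply.w32(&gf, a, b);
 *   d = gf.divide.w32(&gf, c, b);             // d == a for b != 0
 *   gf_free(&gf, 0);
 */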