/*
 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
 *
 * gf_w32.c
 *
 * Routines for 32-bit Galois fields
 */

#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
#include "gf_w32.h"
#include "gf_cpu.h"
#define MM_PRINT32(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 4) printf(" %02x%02x%02x%02x", blah[15-ii], blah[14-ii], blah[13-ii], blah[12-ii]); printf("\n"); }

#define MM_PRINT8(s, r) { uint8_t blah[16], ii; printf("%-12s", s); _mm_storeu_si128((__m128i *)blah, r); for (ii = 0; ii < 16; ii += 1) printf("%s%02x", (ii%4==0) ? "   " : " ", blah[15-ii]); printf("\n"); }
#define AB2(ip, am1, am2, b, t1, t2) {\
  t1 = (b << 1) & am1; \
  t2 = b & am2; \
  t2 = ((t2 << 1) - (t2 >> (GF_FIELD_WIDTH-1))); \
  b = (t1 ^ (t2 & ip));}
#define SSE_AB2(pp, m1, m2, va, t1, t2) {\
          t1 = _mm_and_si128(_mm_slli_epi64(va, 1), m1); \
          t2 = _mm_and_si128(va, m2); \
          t2 = _mm_sub_epi64 (_mm_slli_epi64(t2, 1), _mm_srli_epi64(t2, (GF_FIELD_WIDTH-1))); \
          va = _mm_xor_si128(t1, _mm_and_si128(t2, pp)); }
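/* Editor's note (an editorial sketch, not part of the original source):
   AB2 and SSE_AB2 multiply a packed word of GF(2^32) values by 2 (i.e.,
   by x) without any branches.  Worked through for one 32-bit lane with
   b = 0x80000001:

     t1 = (b << 1) & 0xfffffffe   = 0x00000002   (shifted low bits)
     t2 = b & 0x80000000          = 0x80000000   (just the high bit)
     t2 = (t2 << 1) - (t2 >> 31)  = 0xffffffff   (an all-ones lane mask)
     b  = t1 ^ (t2 & prim_poly)                  (conditional reduction)

   so the primitive polynomial is XORed in exactly when the high bit was
   set -- the usual "times x, then reduce" step, done carry-free across
   every lane at once. */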
static
uint32_t gf_w32_inverse_from_divide (gf_t *gf, uint32_t a)
{
  return gf->divide.w32(gf, 1, a);
}
static
uint32_t gf_w32_divide_from_inverse (gf_t *gf, uint32_t a, uint32_t b)
{
  b = gf->inverse.w32(gf, b);
  return gf->multiply.w32(gf, a, b);
}
static
void
gf_w32_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
  unsigned int i;
  uint32_t *s32;
  uint32_t *d32;

  s32 = (uint32_t *) src;
  d32 = (uint32_t *) dest;

  if (xor) {
    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
      d32[i] ^= gf->multiply.w32(gf, val, s32[i]);
    }
  } else {
    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
      d32[i] = gf->multiply.w32(gf, val, s32[i]);
    }
  }
}
#if defined(INTEL_SSE4_PCLMUL)
static
void
gf_w32_clm_multiply_region_from_single_2(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
  unsigned int i;
  uint32_t *s32;
  uint32_t *d32;

  __m128i         a, b;
  __m128i         result;
  __m128i         prim_poly;
  __m128i         w;
  gf_internal_t * h = gf->scratch;

  prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
  s32 = (uint32_t *) src;
  d32 = (uint32_t *) dest;

  if (xor) {
    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
      b = _mm_insert_epi32 (a, s32[i], 0);
      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
      result = _mm_xor_si128 (result, w);
      d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
    }
  } else {
    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
      b = _mm_insert_epi32 (a, s32[i], 0);
      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
      result = _mm_xor_si128 (result, w);
      d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0));
    }
  }
}
#endif
#if defined(INTEL_SSE4_PCLMUL)
static
void
gf_w32_clm_multiply_region_from_single_3(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
  unsigned int i;
  uint32_t *s32;
  uint32_t *d32;

  __m128i         a, b;
  __m128i         result;
  __m128i         prim_poly;
  __m128i         w;
  gf_internal_t * h = gf->scratch;

  prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);

  s32 = (uint32_t *) src;
  d32 = (uint32_t *) dest;

  if (xor) {
    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
      b = _mm_insert_epi32 (a, s32[i], 0);
      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
      result = _mm_xor_si128 (result, w);
      d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
    }
  } else {
    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
      b = _mm_insert_epi32 (a, s32[i], 0);
      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
      result = _mm_xor_si128 (result, w);
      d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0));
    }
  }
}
#endif
#if defined(INTEL_SSE4_PCLMUL)
static
void
gf_w32_clm_multiply_region_from_single_4(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
  unsigned int i;
  uint32_t *s32;
  uint32_t *d32;

  __m128i         a, b;
  __m128i         result;
  __m128i         prim_poly;
  __m128i         w;
  gf_internal_t * h = gf->scratch;

  prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);

  s32 = (uint32_t *) src;
  d32 = (uint32_t *) dest;

  if (xor) {
    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
      b = _mm_insert_epi32 (a, s32[i], 0);
      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
      result = _mm_xor_si128 (result, w);
      d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
    }
  } else {
    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
      b = _mm_insert_epi32 (a, s32[i], 0);
      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
      result = _mm_xor_si128 (result, w);
      w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
      result = _mm_xor_si128 (result, w);
      d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0));
    }
  }
}
#endif
static
uint32_t gf_w32_euclid (gf_t *gf, uint32_t b)
{
  uint32_t e_i, e_im1, e_ip1;
  uint32_t d_i, d_im1, d_ip1;
  uint32_t y_i, y_im1, y_ip1;
  uint32_t c_i;

  if (b == 0) return -1;
  e_im1 = ((gf_internal_t *) (gf->scratch))->prim_poly;
  e_i = b;
  d_im1 = 32;
  for (d_i = d_im1-1; ((1 << d_i) & e_i) == 0; d_i--) ;
  y_i = 1;
  y_im1 = 0;

  while (e_i != 1) {

    e_ip1 = e_im1;
    d_ip1 = d_im1;
    c_i = 0;

    while (d_ip1 >= d_i) {
      c_i ^= (1 << (d_ip1 - d_i));
      e_ip1 ^= (e_i << (d_ip1 - d_i));
      if (e_ip1 == 0) return 0;
      while ((e_ip1 & (1 << d_ip1)) == 0) d_ip1--;
    }

    y_ip1 = y_im1 ^ gf->multiply.w32(gf, c_i, y_i);
    y_im1 = y_i;
    y_i = y_ip1;

    e_im1 = e_i;
    d_im1 = d_i;
    e_i = e_ip1;
    d_i = d_ip1;
  }

  return y_i;
}
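/* Editor's note: a worked example of the loop above, scaled down to
   GF(2^4) with prim_poly 0x13 for readability (an editorial sketch, not
   part of the original source).  Inverting b = 0x2: e_{i-1} = 0x13 and
   e_i = 0x2.  The inner loop performs the polynomial division, leaving
   the quotient c_i = 0x9 (x^3 + 1) and remainder e_{i+1} = 0x1, so
   y_{i+1} = 0 ^ multiply(0x9, 0x1) = 0x9; the outer loop then exits with
   e_i == 1 and returns 0x9.  Check: 0x2 * 0x9 = x^4 + x = (x+1) + x = 1
   under x^4 = x + 1, so 0x9 is indeed the inverse of 0x2. */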
static
gf_val_32_t gf_w32_extract_word(gf_t *gf, void *start, int bytes, int index)
{
  uint32_t *r32;

  r32 = (uint32_t *) start;
  return r32[index];
}
static
gf_val_32_t gf_w32_composite_extract_word(gf_t *gf, void *start, int bytes, int index)
{
  int sub_size;
  gf_internal_t *h;
  uint8_t *r8, *top;
  uint32_t a, b, *r32;
  gf_region_data rd;

  h = (gf_internal_t *) gf->scratch;
  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 32);
  r32 = (uint32_t *) start;
  if (r32 + index < (uint32_t *) rd.d_start) return r32[index];
  if (r32 + index >= (uint32_t *) rd.d_top) return r32[index];
  index -= (((uint32_t *) rd.d_start) - r32);
  r8 = (uint8_t *) rd.d_start;
  top = (uint8_t *) rd.d_top;
  sub_size = (top-r8)/2;

  a = h->base_gf->extract_word.w32(h->base_gf, r8, sub_size, index);
  b = h->base_gf->extract_word.w32(h->base_gf, r8+sub_size, sub_size, index);
  return (a | (b << 16));
}
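/* Editor's note (an editorial sketch, not part of the original source):
   in the composite field GF((2^16)^2) each 32-bit word is a degree-1
   polynomial a1*s + a0 with 16-bit coefficients, and the ALTMAP region
   layout stores all the a0 halves in the first half of the aligned
   region and all the a1 halves in the second.  That is why the function
   above recurses into the base field once at r8 and once at r8+sub_size,
   then reassembles the word as (a | (b << 16)). */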
static
gf_val_32_t gf_w32_split_extract_word(gf_t *gf, void *start, int bytes, int index)
{
  int i;
  uint32_t *r32, rv;
  uint8_t *r8;
  gf_region_data rd;

  gf_set_region_data(&rd, gf, start, start, bytes, 0, 0, 64);
  r32 = (uint32_t *) start;
  if (r32 + index < (uint32_t *) rd.d_start) return r32[index];
  if (r32 + index >= (uint32_t *) rd.d_top) return r32[index];
  index -= (((uint32_t *) rd.d_start) - r32);
  r8 = (uint8_t *) rd.d_start;
  r8 += ((index & 0xfffffff0)*4);
  r8 += (index & 0xf);
  r8 += 48;
  rv = 0;
  for (i = 0; i < 4; i++) {
    rv <<= 8;
    rv |= *r8;
    r8 -= 16;
  }
  return rv;
}
static
uint32_t gf_w32_matrix (gf_t *gf, uint32_t b)
{
  return gf_bitmatrix_inverse(b, 32, ((gf_internal_t *) (gf->scratch))->prim_poly);
}
/* JSP: GF_MULT_SHIFT: The world's dumbest multiplication algorithm.  I only
   include it for completeness.  It does have the feature that it requires no
   extra memory.
 */
#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
gf_w32_cfmgk_multiply (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
{
  gf_val_32_t rv = 0;

  __m128i         a, b;
  __m128i         result;
  __m128i         w;
  __m128i         g, q;
  gf_internal_t * h = gf->scratch;
  uint64_t        g_star, q_plus;

  q_plus = *(uint64_t *) h->private;
  g_star = *((uint64_t *) h->private + 1);

  a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0);
  b = _mm_insert_epi32 (a, b32, 0);
  g = _mm_insert_epi64 (a, g_star, 0);
  q = _mm_insert_epi64 (a, q_plus, 0);

  result = _mm_clmulepi64_si128 (a, b, 0);
  w = _mm_clmulepi64_si128 (q, _mm_srli_si128 (result, 4), 0);
  w = _mm_clmulepi64_si128 (g, _mm_srli_si128 (w, 4), 0);
  result = _mm_xor_si128 (result, w);

  /* Extracts 32 bit value from result. */
  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));

  return rv;
}
#endif
#if defined(INTEL_SSE4_PCLMUL)
static
void
gf_w32_cfmgk_multiply_region_from_single(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
  unsigned int i;
  uint32_t *s32;
  uint32_t *d32;

  __m128i         a, b;
  __m128i         result;
  __m128i         w;
  __m128i         g, q;
  gf_internal_t * h = gf->scratch;
  uint64_t        g_star, q_plus;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  q_plus = *(uint64_t *) h->private;
  g_star = *((uint64_t *) h->private + 1);

  a = _mm_insert_epi32 (_mm_setzero_si128(), val, 0);
  g = _mm_insert_epi64 (a, g_star, 0);
  q = _mm_insert_epi64 (a, q_plus, 0);
  s32 = (uint32_t *) src;
  d32 = (uint32_t *) dest;

  if (xor) {
    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
      b = _mm_insert_epi32 (a, s32[i], 0);
      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (q, _mm_srli_si128 (result, 4), 0);
      w = _mm_clmulepi64_si128 (g, _mm_srli_si128 (w, 4), 0);
      result = _mm_xor_si128 (result, w);
      d32[i] ^= ((gf_val_32_t)_mm_extract_epi32(result, 0));
    }
  } else {
    for (i = 0; i < bytes/sizeof(uint32_t); i++) {
      b = _mm_insert_epi32 (a, s32[i], 0);
      result = _mm_clmulepi64_si128 (a, b, 0);
      w = _mm_clmulepi64_si128 (q, _mm_srli_si128 (result, 4), 0);
      w = _mm_clmulepi64_si128 (g, _mm_srli_si128 (w, 4), 0);
      result = _mm_xor_si128 (result, w);
      d32[i] = ((gf_val_32_t)_mm_extract_epi32(result, 0));
    }
  }
}
#endif
#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
gf_w32_clm_multiply_2 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
{
  gf_val_32_t rv = 0;

  __m128i         a, b;
  __m128i         result;
  __m128i         prim_poly;
  __m128i         w;
  gf_internal_t * h = gf->scratch;

  a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0);
  b = _mm_insert_epi32 (a, b32, 0);

  prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));

  /* Do the initial multiply */

  result = _mm_clmulepi64_si128 (a, b, 0);

  /* Ben: Do prim_poly reduction twice. We are guaranteed that we will only
     have to do the reduction at most twice, because (w-2)/z == 2, where z
     is the number of zeros after the leading 1.

     _mm_clmulepi64_si128 is the carryless multiply operation. Here
     _mm_srli_si128 shifts the result to the right by 4 bytes. This allows
     us to multiply the prim_poly by the leading bits of the result. We
     then xor the result of that operation back with the result. */

  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
  result = _mm_xor_si128 (result, w);
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
  result = _mm_xor_si128 (result, w);

  /* Extracts 32 bit value from result. */
  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));

  return rv;
}
#endif
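/* Editor's note: a concrete reading of Ben's bound above (an editorial
   sketch, not part of the original source).  The raw product of two
   degree-31 polynomials occupies bits 0..62.  Each reduction pass
   multiplies the bits at positions >= 32 (obtained via the 4-byte shift)
   by prim_poly = x^32 + p_low(x): the x^32 term cancels those high bits
   in place, while p_low reintroduces them 32 - deg(p_low) positions
   lower.  When (0xfffe0000 & prim_poly) == 0, deg(p_low) <= 16, so the
   overflow shrinks from bits 32..62 to at most bits 32..46 after one
   pass, and a second pass leaves nothing above bit 31 -- hence two
   reduction steps here, three in clm_multiply_3 (deg(p_low) <= 21), and
   four in clm_multiply_4 (deg(p_low) <= 24). */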
#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
gf_w32_clm_multiply_3 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
{
  gf_val_32_t rv = 0;

  __m128i         a, b;
  __m128i         result;
  __m128i         prim_poly;
  __m128i         w;
  gf_internal_t * h = gf->scratch;

  a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0);
  b = _mm_insert_epi32 (a, b32, 0);

  prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));

  /* Do the initial multiply */

  result = _mm_clmulepi64_si128 (a, b, 0);

  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
  result = _mm_xor_si128 (result, w);
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
  result = _mm_xor_si128 (result, w);
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
  result = _mm_xor_si128 (result, w);

  /* Extracts 32 bit value from result. */

  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));

  return rv;
}
#endif
#if defined(INTEL_SSE4_PCLMUL)
static
inline
gf_val_32_t
gf_w32_clm_multiply_4 (gf_t *gf, gf_val_32_t a32, gf_val_32_t b32)
{
  gf_val_32_t rv = 0;

  __m128i         a, b;
  __m128i         result;
  __m128i         prim_poly;
  __m128i         w;
  gf_internal_t * h = gf->scratch;

  a = _mm_insert_epi32 (_mm_setzero_si128(), a32, 0);
  b = _mm_insert_epi32 (a, b32, 0);

  prim_poly = _mm_set_epi32(0, 0, 1, (uint32_t)(h->prim_poly & 0xffffffffULL));

  /* Do the initial multiply */

  result = _mm_clmulepi64_si128 (a, b, 0);

  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
  result = _mm_xor_si128 (result, w);
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
  result = _mm_xor_si128 (result, w);
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
  result = _mm_xor_si128 (result, w);
  w = _mm_clmulepi64_si128 (prim_poly, _mm_srli_si128 (result, 4), 0);
  result = _mm_xor_si128 (result, w);

  /* Extracts 32 bit value from result. */

  rv = ((gf_val_32_t)_mm_extract_epi32(result, 0));

  return rv;
}
#endif
static
inline
uint32_t
gf_w32_shift_multiply (gf_t *gf, uint32_t a32, uint32_t b32)
{
  uint64_t product, i, pp, a, b, one;
  gf_internal_t *h;

  a = a32;
  b = b32;
  h = (gf_internal_t *) gf->scratch;
  one = 1;
  pp = h->prim_poly | (one << 32);

  product = 0;

  for (i = 0; i < GF_FIELD_WIDTH; i++) {
    if (a & (one << i)) product ^= (b << i);
  }
  for (i = (GF_FIELD_WIDTH*2-2); i >= GF_FIELD_WIDTH; i--) {
    if (product & (one << i)) product ^= (pp << (i-GF_FIELD_WIDTH));
  }
  return product;
}
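/* Editor's note: a worked example of the two loops above in a smaller
   field (an editorial sketch, not part of the original source).  In
   GF(2^4) with pp = 0x13, multiplying a = 0x7 by b = 0x5 first forms the
   raw carry-free product (x^2+x+1)(x^2+1) = x^4+x^3+x+1 = 0x1b; the
   reduction loop then finds the x^4 term and XORs in pp << 0, giving
   0x1b ^ 0x13 = 0x08, i.e. 7 * 5 = x^3 in that field. */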
static 
int gf_w32_cfmgk_init(gf_t *gf)
{
  SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
  SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single)

#if defined(INTEL_SSE4_PCLMUL)
  if (gf_cpu_supports_intel_pclmul) {
    gf_internal_t *h;

    h = (gf_internal_t *) gf->scratch;
    SET_FUNCTION(gf,multiply,w32,gf_w32_cfmgk_multiply)
    SET_FUNCTION(gf,multiply_region,w32,gf_w32_cfmgk_multiply_region_from_single)

    uint64_t *q_plus = (uint64_t *) h->private;
    uint64_t *g_star = (uint64_t *) h->private + 1;

    uint64_t tmp = h->prim_poly << 32;
    *q_plus = 1ULL << 32;

    int i;
    for(i = 63; i >= 32; i--)
      if((1ULL << i) & tmp)
      {
        *q_plus |= 1ULL << (i-32);
        tmp ^= h->prim_poly << (i-32);
      }

    *g_star = h->prim_poly & ((1ULL << 32) - 1);

    return 1;
  }
#endif

  return 0;
}
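/* Editor's note (an editorial sketch, not part of the original source):
   the loop above is a binary long division of x^64 by the full primitive
   polynomial g(x) = x^32 + p_low(x), so *q_plus ends up holding the
   quotient floor(x^64 / g(x)) and *g_star holds p_low, i.e. g(x) - x^32.
   With those two constants, the pair of dependent carryless multiplies in
   gf_w32_cfmgk_multiply,

     w = clmul(q_plus, hi32(a*b));  w = clmul(g_star, hi32(w));

   acts as a Barrett-style reduction of the 64-bit product back into 32
   bits, in the manner of Gueron and Kounavis's GCM reduction (which
   appears to be the "gk" in cfmgk). */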
static 
int gf_w32_cfm_init(gf_t *gf)
{
  SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
  SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single)

  /*Ben: We also check to see if the prim poly will work for pclmul */
  /*Ben: Check to see how many reduction steps it will take*/

#if defined(INTEL_SSE4_PCLMUL)
  if (gf_cpu_supports_intel_pclmul) {
    gf_internal_t *h;

    h = (gf_internal_t *) gf->scratch;

    if ((0xfffe0000 & h->prim_poly) == 0){
      SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2)
      SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_2)
    }else if ((0xffc00000 & h->prim_poly) == 0){
      SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_3)
      SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_3)
    }else if ((0xfe000000 & h->prim_poly) == 0){
      SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4)
      SET_FUNCTION(gf,multiply_region,w32,gf_w32_clm_multiply_region_from_single_4)
    } else {
      return 0;
    }
    return 1;
  }
#endif

  return 0;
}
static  
int gf_w32_shift_init(gf_t *gf)
{
  SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
  SET_FUNCTION(gf,multiply_region,w32,gf_w32_multiply_region_from_single)
  SET_FUNCTION(gf,multiply,w32,gf_w32_shift_multiply)
  return 1;
}
static
void
gf_w32_group_set_shift_tables(uint32_t *shift, uint32_t val, gf_internal_t *h)
{
  uint32_t i;
  uint32_t j;

  shift[0] = 0;

  for (i = 1; i < ((uint32_t)1 << h->arg1); i <<= 1) {
    for (j = 0; j < i; j++) shift[i|j] = shift[j]^val;
    if (val & GF_FIRST_BIT) {
      val <<= 1;
      val ^= h->prim_poly;
    } else {
      val <<= 1;
    }
  }
}
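/* Editor's note: a worked example of the table built above (an editorial
   sketch, not part of the original source).  With g_s = h->arg1 = 2 the
   outer loop runs twice: after shift[0] = 0 and shift[1] = val, val is
   doubled (reduced by prim_poly when its top bit was set), giving
   shift[2] = 2*val and shift[3] = 2*val ^ val.  In general
   shift[v] == v * val in GF(2^32) for every g_s-bit v, so the group
   multiply routines below can consume g_s bits of the other operand per
   table lookup. */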
static
void gf_w32_group_s_equals_r_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  int leftover, rs;
  uint32_t p, l, ind, a32;
  int bits_left;
  int g_s;
  gf_region_data rd;
  uint32_t *s32, *d32, *top;
  struct gf_w32_group_data *gd;
  gf_internal_t *h = (gf_internal_t *) gf->scratch;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gd = (struct gf_w32_group_data *) h->private;
  g_s = h->arg1;
  gf_w32_group_set_shift_tables(gd->shift, val, h);

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
  gf_do_initial_region_alignment(&rd);

  s32 = (uint32_t *) rd.s_start;
  d32 = (uint32_t *) rd.d_start;
  top = (uint32_t *) rd.d_top;

  leftover = 32 % g_s;
  if (leftover == 0) leftover = g_s;

  while (d32 < top) {
    rs = 32 - leftover;
    a32 = *s32;
    ind = a32 >> rs;
    a32 <<= leftover;
    p = gd->shift[ind];

    bits_left = rs;
    rs = 32 - g_s;

    while (bits_left > 0) {
      bits_left -= g_s;
      ind = a32 >> rs;
      a32 <<= g_s;
      l = p >> rs;
      p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s));
    }
    if (xor) p ^= *d32;
    *d32 = p;
    d32++;
    s32++;
  }
  gf_do_final_region_alignment(&rd);
}
static
void gf_w32_group_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  uint32_t *s32, *d32, *top;
  int i;
  int leftover;
  uint64_t p, l, r;
  uint32_t a32, ind;
  int g_s, g_r;
  struct gf_w32_group_data *gd;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  g_s = h->arg1;
  g_r = h->arg2;

  gd = (struct gf_w32_group_data *) h->private;
  gf_w32_group_set_shift_tables(gd->shift, val, h);

  leftover = GF_FIELD_WIDTH % g_s;
  if (leftover == 0) leftover = g_s;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
  gf_do_initial_region_alignment(&rd);

  s32 = (uint32_t *) rd.s_start;
  d32 = (uint32_t *) rd.d_start;
  top = (uint32_t *) rd.d_top;

  while (d32 < top) {
    a32 = *s32;
    ind = a32 >> (GF_FIELD_WIDTH - leftover);
    p = gd->shift[ind];
    a32 <<= leftover;
    i = (GF_FIELD_WIDTH - leftover);

    while (i > g_s) {
      ind = a32 >> (GF_FIELD_WIDTH-g_s);
      a32 <<= g_s;
      p = (gd->shift[ind] ^ (p << g_s));
      i -= g_s;
    }

    ind = a32 >> (GF_FIELD_WIDTH-g_s);
    p = (gd->shift[ind] ^ (p << g_s));

    for (i = gd->tshift; i >= 0; i -= g_r) {
      l = p & (gd->rmask << i);
      r = gd->reduce[l >> (i+32)];
      r <<= (i);
      p ^= r;
    }

    if (xor) p ^= *d32;
    *d32 = p;
    d32++;
    s32++;
  }
  gf_do_final_region_alignment(&rd);
}
static
gf_val_32_t
gf_w32_group_s_equals_r_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  int leftover, rs;
  uint32_t p, l, ind, a32;
  int bits_left;
  int g_s;

  struct gf_w32_group_data *gd;
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  g_s = h->arg1;

  gd = (struct gf_w32_group_data *) h->private;
  gf_w32_group_set_shift_tables(gd->shift, b, h);

  leftover = 32 % g_s;
  if (leftover == 0) leftover = g_s;

  rs = 32 - leftover;
  a32 = a;
  ind = a32 >> rs;
  a32 <<= leftover;
  p = gd->shift[ind];

  bits_left = rs;
  rs = 32 - g_s;

  while (bits_left > 0) {
    bits_left -= g_s;
    ind = a32 >> rs;
    a32 <<= g_s;
    l = p >> rs;
    p = (gd->shift[ind] ^ gd->reduce[l] ^ (p << g_s));
  }
  return p;
}
static
gf_val_32_t
gf_w32_group_4_4_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  uint32_t p, l, ind, a32;

  struct gf_w32_group_data *d44;
  gf_internal_t *h = (gf_internal_t *) gf->scratch;

  d44 = (struct gf_w32_group_data *) h->private;
  gf_w32_group_set_shift_tables(d44->shift, b, h);

  a32 = a;
  ind = a32 >> 28;
  a32 <<= 4;
  p = d44->shift[ind];

  ind = a32 >> 28;
  a32 <<= 4;
  l = p >> 28;
  p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
  ind = a32 >> 28;
  a32 <<= 4;
  l = p >> 28;
  p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
  ind = a32 >> 28;
  a32 <<= 4;
  l = p >> 28;
  p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
  ind = a32 >> 28;
  a32 <<= 4;
  l = p >> 28;
  p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
  ind = a32 >> 28;
  a32 <<= 4;
  l = p >> 28;
  p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
  ind = a32 >> 28;
  a32 <<= 4;
  l = p >> 28;
  p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));
  ind = a32 >> 28;
  l = p >> 28;
  p = (d44->shift[ind] ^ d44->reduce[l] ^ (p << 4));

  return p;
}
static
gf_val_32_t
gf_w32_group_multiply(gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  int i;
  int leftover;
  uint64_t p, l, r;
  uint32_t a32, ind;
  int g_s, g_r;
  struct gf_w32_group_data *gd;

  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  g_s = h->arg1;
  g_r = h->arg2;
  gd = (struct gf_w32_group_data *) h->private;
  gf_w32_group_set_shift_tables(gd->shift, b, h);

  leftover = GF_FIELD_WIDTH % g_s;
  if (leftover == 0) leftover = g_s;

  a32 = a;
  ind = a32 >> (GF_FIELD_WIDTH - leftover);
  p = gd->shift[ind];
  a32 <<= leftover;
  i = (GF_FIELD_WIDTH - leftover);

  while (i > g_s) {
    ind = a32 >> (GF_FIELD_WIDTH-g_s);
    a32 <<= g_s;
    p = (gd->shift[ind] ^ (p << g_s));
    i -= g_s;
  }

  ind = a32 >> (GF_FIELD_WIDTH-g_s);
  p = (gd->shift[ind] ^ (p << g_s));

  for (i = gd->tshift; i >= 0; i -= g_r) {
    l = p & (gd->rmask << i);
    r = gd->reduce[l >> (i+32)];
    r <<= (i);
    p ^= r;
  }
  return p;
}
static
inline
gf_val_32_t
gf_w32_bytwo_b_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  uint32_t prod, pp, bmask;
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  prod = 0;
  bmask = 0x80000000;

  while (1) {
    if (a & 1) prod ^= b;
    a >>= 1;
    if (a == 0) return prod;
    if (b & bmask) {
      b = ((b << 1) ^ pp);
    } else {
      b <<= 1;
    }
  }
}
static
inline
gf_val_32_t
gf_w32_bytwo_p_multiply (gf_t *gf, gf_val_32_t a, gf_val_32_t b)
{
  uint32_t prod, pp, pmask, amask;
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  prod = 0;
  pmask = 0x80000000;
  amask = 0x80000000;

  while (amask != 0) {
    if (prod & pmask) {
      prod = ((prod << 1) ^ pp);
    } else {
      prod <<= 1;
    }
    if (a & amask) prod ^= b;
    amask >>= 1;
  }
  return prod;
}
static
void
gf_w32_bytwo_p_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  uint64_t *s64, *d64, t1, t2, ta, prod, amask;
  gf_region_data rd;
  struct gf_w32_bytwo_data *btd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 8);
  gf_do_initial_region_alignment(&rd);

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;

  if (xor) {
    while (s64 < (uint64_t *) rd.s_top) {
      prod = 0;
      amask = 0x80000000;
      ta = *s64;
      while (amask != 0) {
        AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
        if (val & amask) prod ^= ta;
        amask >>= 1;
      }
      *d64 ^= prod;
      d64++;
      s64++;
    }
  } else {
    while (s64 < (uint64_t *) rd.s_top) {
      prod = 0;
      amask = 0x80000000;
      ta = *s64;
      while (amask != 0) {
        AB2(btd->prim_poly, btd->mask1, btd->mask2, prod, t1, t2);
        if (val & amask) prod ^= ta;
        amask >>= 1;
      }
      *d64 = prod;
      d64++;
      s64++;
    }
  }
  gf_do_final_region_alignment(&rd);
}
#define BYTWO_P_ONESTEP {\
      SSE_AB2(pp, m1 ,m2, prod, t1, t2); \
      t1 = _mm_and_si128(v, one); \
      t1 = _mm_sub_epi32(t1, one); \
      t1 = _mm_and_si128(t1, ta); \
      prod = _mm_xor_si128(prod, t1); \
      v = _mm_srli_epi64(v, 1); }
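/* Editor's note (an editorial sketch, not part of the original source):
   in BYTWO_P_ONESTEP the pair

     t1 = _mm_and_si128(v, one);  t1 = _mm_sub_epi32(t1, one);

   turns the low bit of each 32-bit lane of v into a full lane mask
   (1 -> 0x00000000, 0 -> 0xffffffff).  Because the caller loads v with
   vrev -- the bit-reversed complement of val -- prod accumulates ta on
   exactly the set bits of val, most significant bit first, with no
   branches anywhere in the 32-step unrolled sequence below. */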
#ifdef INTEL_SSE2
static
void
gf_w32_bytwo_p_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  int i;
  uint8_t *s8, *d8;
  uint32_t vrev;
  __m128i pp, m1, m2, ta, prod, t1, t2, tp, one, v;
  struct gf_w32_bytwo_data *btd;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  vrev = 0;
  for (i = 0; i < 32; i++) {
    vrev <<= 1;
    if (!(val & ((gf_val_32_t)1 << i))) vrev |= 1;
  }

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;

  pp = _mm_set1_epi32(btd->prim_poly&0xffffffff);
  m1 = _mm_set1_epi32((btd->mask1)&0xffffffff);
  m2 = _mm_set1_epi32((btd->mask2)&0xffffffff);
  one = _mm_set1_epi32(1);

  while (d8 < (uint8_t *) rd.d_top) {
    prod = _mm_setzero_si128();
    v = _mm_set1_epi32(vrev);
    ta = _mm_load_si128((__m128i *) s8);
    tp = (!xor) ? _mm_setzero_si128() : _mm_load_si128((__m128i *) d8);
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP; BYTWO_P_ONESTEP;
    _mm_store_si128((__m128i *) d8, _mm_xor_si128(prod, tp));
    d8 += 16;
    s8 += 16;
  }
  gf_do_final_region_alignment(&rd);
}
#endif
static
void
gf_w32_bytwo_b_nosse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  uint64_t *s64, *d64, t1, t2, ta, tb, prod;
  struct gf_w32_bytwo_data *btd;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
  gf_do_initial_region_alignment(&rd);

  btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;
  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;

  switch (val) {
  case 2:
    if (xor) {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 ^= ta;
        d64++;
        s64++;
      }
    } else {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 = ta;
        d64++;
        s64++;
      }
    }
    break;
  case 3:
    if (xor) {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 ^= (ta ^ prod);
        d64++;
        s64++;
      }
    } else {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 = (ta ^ prod);
        d64++;
        s64++;
      }
    }
    break;
  case 4:
    if (xor) {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 ^= ta;
        d64++;
        s64++;
      }
    } else {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 = ta;
        d64++;
        s64++;
      }
    }
    break;
  case 5:
    if (xor) {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 ^= (ta ^ prod);
        d64++;
        s64++;
      }
    } else {
      while (d64 < (uint64_t *) rd.d_top) {
        ta = *s64;
        prod = ta;
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        *d64 = (ta ^ prod);
        d64++;
        s64++;
      }
    }
    break;
  default:
    if (xor) {
      while (d64 < (uint64_t *) rd.d_top) {
        prod = *d64;
        ta = *s64;
        tb = val;
        while (1) {
          if (tb & 1) prod ^= ta;
          tb >>= 1;
          if (tb == 0) break;
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        }
        *d64 = prod;
        d64++;
        s64++;
      }
    } else {
      while (d64 < (uint64_t *) rd.d_top) {
        prod = 0;
        ta = *s64;
        tb = val;
        while (1) {
          if (tb & 1) prod ^= ta;
          tb >>= 1;
          if (tb == 0) break;
          AB2(btd->prim_poly, btd->mask1, btd->mask2, ta, t1, t2);
        }
        *d64 = prod;
        d64++;
        s64++;
      }
    }
    break;
  }
  gf_do_final_region_alignment(&rd);
}
#ifdef INTEL_SSE2
static
void
gf_w32_bytwo_b_sse_region_2_noxor(gf_region_data *rd, struct gf_w32_bytwo_data *btd)
{
  uint8_t *d8, *s8;
  __m128i pp, m1, m2, t1, t2, va;

  s8 = (uint8_t *) rd->s_start;
  d8 = (uint8_t *) rd->d_start;

  pp = _mm_set1_epi32(btd->prim_poly&0xffffffff);
  m1 = _mm_set1_epi32((btd->mask1)&0xffffffff);
  m2 = _mm_set1_epi32((btd->mask2)&0xffffffff);

  while (d8 < (uint8_t *) rd->d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    SSE_AB2(pp, m1, m2, va, t1, t2);
    _mm_store_si128((__m128i *)d8, va);
    d8 += 16;
    s8 += 16;
  }
}
#endif
#ifdef INTEL_SSE2
static
void
gf_w32_bytwo_b_sse_region_2_xor(gf_region_data *rd, struct gf_w32_bytwo_data *btd)
{
  uint8_t *d8, *s8;
  __m128i pp, m1, m2, t1, t2, va, vb;

  s8 = (uint8_t *) rd->s_start;
  d8 = (uint8_t *) rd->d_start;

  pp = _mm_set1_epi32(btd->prim_poly&0xffffffff);
  m1 = _mm_set1_epi32((btd->mask1)&0xffffffff);
  m2 = _mm_set1_epi32((btd->mask2)&0xffffffff);

  while (d8 < (uint8_t *) rd->d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    SSE_AB2(pp, m1, m2, va, t1, t2);
    vb = _mm_load_si128 ((__m128i *)(d8));
    vb = _mm_xor_si128(vb, va);
    _mm_store_si128((__m128i *)d8, vb);
    d8 += 16;
    s8 += 16;
  }
}
#endif
#ifdef INTEL_SSE2
static
void
gf_w32_bytwo_b_sse_multiply_region(gf_t *gf, void *src, void *dest, gf_val_32_t val, int bytes, int xor)
{
  uint32_t itb;
  uint8_t *d8, *s8;
  __m128i pp, m1, m2, t1, t2, va, vb;
  struct gf_w32_bytwo_data *btd;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 16);
  gf_do_initial_region_alignment(&rd);

  btd = (struct gf_w32_bytwo_data *) ((gf_internal_t *) (gf->scratch))->private;

  if (val == 2) {
    if (xor) {
      gf_w32_bytwo_b_sse_region_2_xor(&rd, btd);
    } else {
      gf_w32_bytwo_b_sse_region_2_noxor(&rd, btd);
    }
    gf_do_final_region_alignment(&rd);
    return;
  }

  s8 = (uint8_t *) rd.s_start;
  d8 = (uint8_t *) rd.d_start;

  pp = _mm_set1_epi32(btd->prim_poly&0xffffffff);
  m1 = _mm_set1_epi32((btd->mask1)&0xffffffff);
  m2 = _mm_set1_epi32((btd->mask2)&0xffffffff);

  while (d8 < (uint8_t *) rd.d_top) {
    va = _mm_load_si128 ((__m128i *)(s8));
    vb = (!xor) ? _mm_setzero_si128() : _mm_load_si128 ((__m128i *)(d8));
    itb = val;
    while (1) {
      if (itb & 1) vb = _mm_xor_si128(vb, va);
      itb >>= 1;
      if (itb == 0) break;
      SSE_AB2(pp, m1, m2, va, t1, t2);
    }
    _mm_store_si128((__m128i *)d8, vb);
    d8 += 16;
    s8 += 16;
  }

  gf_do_final_region_alignment(&rd);
}
#endif
static
int gf_w32_bytwo_init(gf_t *gf)
{
  gf_internal_t *h;
  uint64_t ip, m1, m2;
  struct gf_w32_bytwo_data *btd;

  h = (gf_internal_t *) gf->scratch;
  btd = (struct gf_w32_bytwo_data *) (h->private);
  ip = h->prim_poly & 0xffffffff;
  m1 = 0xfffffffe;
  m2 = 0x80000000;
  btd->prim_poly = 0;
  btd->mask1 = 0;
  btd->mask2 = 0;

  while (ip != 0) {
    btd->prim_poly |= ip;
    btd->mask1 |= m1;
    btd->mask2 |= m2;
    ip <<= GF_FIELD_WIDTH;
    m1 <<= GF_FIELD_WIDTH;
    m2 <<= GF_FIELD_WIDTH;
  }

  if (h->mult_type == GF_MULT_BYTWO_p) {
    SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_p_multiply)
#ifdef INTEL_SSE2
    if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
      SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_sse_multiply_region)
    } else {
#endif
      SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_p_nosse_multiply_region)
      if(h->region_type & GF_REGION_SIMD)
        return 0;
#ifdef INTEL_SSE2
    }
#endif
  } else {
    SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_b_multiply)
#ifdef INTEL_SSE2
    if (gf_cpu_supports_intel_sse2 && !(h->region_type & GF_REGION_NOSIMD)) {
      SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_sse_multiply_region)
    } else {
#endif
      SET_FUNCTION(gf,multiply_region,w32,gf_w32_bytwo_b_nosse_multiply_region)
      if(h->region_type & GF_REGION_SIMD)
        return 0;
#ifdef INTEL_SSE2
    }
#endif
  }

  SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
  return 1;
}
static
inline
uint32_t
gf_w32_split_8_8_multiply (gf_t *gf, uint32_t a32, uint32_t b32)
{
  uint32_t product, i, j, mask, tb;
  gf_internal_t *h;
  struct gf_w32_split_8_8_data *d8;

  h = (gf_internal_t *) gf->scratch;
  d8 = (struct gf_w32_split_8_8_data *) h->private;
  product = 0;
  mask = 0xff;

  for (i = 0; i < 4; i++) {
    tb = b32;
    for (j = 0; j < 4; j++) {
      product ^= d8->tables[i+j][a32&mask][tb&mask];
      tb >>= 8;
    }
    a32 >>= 8;
  }
  return product;
}
static
void
gf_w32_split_8_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
  gf_internal_t *h;
  uint32_t *s32, *d32, *top, p, a, v;
  struct gf_split_8_32_lazy_data *d8;
  struct gf_w32_split_8_8_data *d88;
  uint32_t *t[4];
  int i, j, k, change;
  uint32_t pp;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;
  if (h->arg1 == 32 || h->arg2 == 32 || h->mult_type == GF_MULT_DEFAULT) {
    d8 = (struct gf_split_8_32_lazy_data *) h->private;
    for (i = 0; i < 4; i++) t[i] = d8->tables[i];
    change = (val != d8->last_value);
    if (change) d8->last_value = val;
  } else {
    d88 = (struct gf_w32_split_8_8_data *) h->private;
    for (i = 0; i < 4; i++) t[i] = d88->region_tables[i];
    change = (val != d88->last_value);
    if (change) d88->last_value = val;
  }
  pp = h->prim_poly;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
  gf_do_initial_region_alignment(&rd);

  s32 = (uint32_t *) rd.s_start;
  d32 = (uint32_t *) rd.d_start;
  top = (uint32_t *) rd.d_top;

  if (change) {
    v = val;
    for (i = 0; i < 4; i++) {
      t[i][0] = 0;
      for (j = 1; j < 256; j <<= 1) {
        for (k = 0; k < j; k++) {
          t[i][k^j] = (v ^ t[i][k]);
        }
        v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
      }
    }
  }

  while (d32 < top) {
    p = (xor) ? *d32 : 0;
    a = *s32;
    i = 0;
    while (a != 0) {
      v = (a & 0xff);
      p ^= t[i][v];
      a >>= 8;
      i++;
    }
    *d32 = p;
    d32++;
    s32++;
  }

  gf_do_final_region_alignment(&rd);
}
static
void
gf_w32_split_16_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
  gf_internal_t *h;
  uint32_t *s32, *d32, *top, p, a, v;
  struct gf_split_16_32_lazy_data *d16;
  uint32_t *t[2];
  int i, j, k, change;
  uint32_t pp;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;
  d16 = (struct gf_split_16_32_lazy_data *) h->private;
  for (i = 0; i < 2; i++) t[i] = d16->tables[i];
  change = (val != d16->last_value);
  if (change) d16->last_value = val;

  pp = h->prim_poly;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
  gf_do_initial_region_alignment(&rd);

  s32 = (uint32_t *) rd.s_start;
  d32 = (uint32_t *) rd.d_start;
  top = (uint32_t *) rd.d_top;

  if (change) {
    v = val;
    for (i = 0; i < 2; i++) {
      t[i][0] = 0;
      for (j = 1; j < (1 << 16); j <<= 1) {
        for (k = 0; k < j; k++) {
          t[i][k^j] = (v ^ t[i][k]);
        }
        v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
      }
    }
  }

  while (d32 != top) {
    p = (xor) ? *d32 : 0;
    a = *s32;
    i = 0;
    while (a != 0 && i < 2) {
      v = (a & 0xffff);
      p ^= t[i][v];
      a >>= 16;
      i++;
    }
    *d32 = p;
    d32++;
    s32++;
  }

  gf_do_final_region_alignment(&rd);
}
static
void
gf_w32_split_2_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
  gf_internal_t *h;
  struct gf_split_2_32_lazy_data *ld;
  int i;
  uint32_t pp, v, v2, s, *s32, *d32, *top;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
  gf_do_initial_region_alignment(&rd);

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  ld = (struct gf_split_2_32_lazy_data *) h->private;

  if (ld->last_value != val) {
    v = val;
    for (i = 0; i < 16; i++) {
      v2 = (v << 1);
      if (v & GF_FIRST_BIT) v2 ^= pp;
      ld->tables[i][0] = 0;
      ld->tables[i][1] = v;
      ld->tables[i][2] = v2;
      ld->tables[i][3] = (v2 ^ v);
      v = (v2 << 1);
      if (v2 & GF_FIRST_BIT) v ^= pp;
    }
  }
  ld->last_value = val;

  s32 = (uint32_t *) rd.s_start;
  d32 = (uint32_t *) rd.d_start;
  top = (uint32_t *) rd.d_top;

  while (d32 != top) {
    v = (xor) ? *d32 : 0;
    s = *s32;
    i = 0;
    while (s != 0) {
      v ^= ld->tables[i][s&3];
      s >>= 2;
      i++;
    }
    *d32 = v;
    d32++;
    s32++;
  }

  gf_do_final_region_alignment(&rd);
}
#ifdef INTEL_SSSE3
static
void
gf_w32_split_2_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
  gf_internal_t *h;
  int i, tindex;
  uint32_t pp, v, v2, *s32, *d32, *top;
  __m128i vi, si, pi, shuffler, tables[16], adder, xi, mask1, mask2;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
  gf_do_initial_region_alignment(&rd);

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  s32 = (uint32_t *) rd.s_start;
  d32 = (uint32_t *) rd.d_start;
  top = (uint32_t *) rd.d_top;

  v = val;
  for (i = 0; i < 16; i++) {
    v2 = (v << 1);
    if (v & GF_FIRST_BIT) v2 ^= pp;
    tables[i] = _mm_set_epi32(v2 ^ v, v2, v, 0);
    v = (v2 << 1);
    if (v2 & GF_FIRST_BIT) v ^= pp;
  }

  shuffler = _mm_set_epi8(0xc, 0xc, 0xc, 0xc, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);
  adder = _mm_set_epi8(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0);
  mask1 = _mm_set1_epi8(0x3);
  mask2 = _mm_set1_epi8(0xc);

  while (d32 != top) {
    pi = (xor) ? _mm_load_si128 ((__m128i *) d32) : _mm_setzero_si128();
    vi = _mm_load_si128((__m128i *) s32);

    tindex = 0;
    for (i = 0; i < 4; i++) {
      si = _mm_shuffle_epi8(vi, shuffler);

      xi = _mm_and_si128(si, mask1);
      xi = _mm_slli_epi16(xi, 2);
      xi = _mm_xor_si128(xi, adder);
      pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi));
      tindex++;

      xi = _mm_and_si128(si, mask2);
      xi = _mm_xor_si128(xi, adder);
      pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi));
      tindex++;
      si = _mm_srli_epi16(si, 2);

      xi = _mm_and_si128(si, mask2);
      xi = _mm_xor_si128(xi, adder);
      pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi));
      tindex++;
      si = _mm_srli_epi16(si, 2);

      xi = _mm_and_si128(si, mask2);
      xi = _mm_xor_si128(xi, adder);
      pi = _mm_xor_si128(pi, _mm_shuffle_epi8(tables[tindex], xi));
      tindex++;

      vi = _mm_srli_epi32(vi, 8);
    }
    _mm_store_si128((__m128i *) d32, pi);
    d32 += 4;
    s32 += 4;
  }

  gf_do_final_region_alignment(&rd);
}
#endif
static
void
gf_w32_split_4_32_lazy_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
  gf_internal_t *h;
  struct gf_split_4_32_lazy_data *ld;
  int i, j, k;
  uint32_t pp, v, s, *s32, *d32, *top;
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  ld = (struct gf_split_4_32_lazy_data *) h->private;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);
  gf_do_initial_region_alignment(&rd);

  if (ld->last_value != val) {
    v = val;
    for (i = 0; i < 8; i++) {
      ld->tables[i][0] = 0;
      for (j = 1; j < 16; j <<= 1) {
        for (k = 0; k < j; k++) {
          ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
        }
        v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
      }
    }
  }
  ld->last_value = val;

  s32 = (uint32_t *) rd.s_start;
  d32 = (uint32_t *) rd.d_start;
  top = (uint32_t *) rd.d_top;

  while (d32 != top) {
    v = (xor) ? *d32 : 0;
    s = *s32;
    i = 0;
    while (s != 0) {
      v ^= ld->tables[i][s&0xf];
      s >>= 4;
      i++;
    }
    *d32 = v;
    d32++;
    s32++;
  }

  gf_do_final_region_alignment(&rd);
}
#ifdef INTEL_SSSE3
static
void
gf_w32_split_4_32_lazy_sse_altmap_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
  gf_internal_t *h;
  int i, j, k;
  uint32_t pp, v, *s32, *d32, *top;
  __m128i si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3;
  struct gf_split_4_32_lazy_data *ld;
  uint8_t btable[16];
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64);
  gf_do_initial_region_alignment(&rd);

  s32 = (uint32_t *) rd.s_start;
  d32 = (uint32_t *) rd.d_start;
  top = (uint32_t *) rd.d_top;

  ld = (struct gf_split_4_32_lazy_data *) h->private;

  v = val;
  for (i = 0; i < 8; i++) {
    ld->tables[i][0] = 0;
    for (j = 1; j < 16; j <<= 1) {
      for (k = 0; k < j; k++) {
        ld->tables[i][k^j] = (v ^ ld->tables[i][k]);
      }
      v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
    }
    for (j = 0; j < 4; j++) {
      for (k = 0; k < 16; k++) {
        btable[k] = (uint8_t) ld->tables[i][k];
        ld->tables[i][k] >>= 8;
      }
      tables[i][j] = _mm_loadu_si128((__m128i *) btable);
    }
  }

  mask1 = _mm_set1_epi8(0xf);

  if (xor) {
    while (d32 != top) {
      p0 = _mm_load_si128 ((__m128i *) d32);
      p1 = _mm_load_si128 ((__m128i *) (d32+4));
      p2 = _mm_load_si128 ((__m128i *) (d32+8));
      p3 = _mm_load_si128 ((__m128i *) (d32+12));

      v0 = _mm_load_si128((__m128i *) s32); s32 += 4;
      v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
      v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
      v3 = _mm_load_si128((__m128i *) s32); s32 += 4;

      si = _mm_and_si128(v0, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si));

      v0 = _mm_srli_epi32(v0, 4);
      si = _mm_and_si128(v0, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si));

      si = _mm_and_si128(v1, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si));

      v1 = _mm_srli_epi32(v1, 4);
      si = _mm_and_si128(v1, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si));

      si = _mm_and_si128(v2, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si));

      v2 = _mm_srli_epi32(v2, 4);
      si = _mm_and_si128(v2, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si));

      si = _mm_and_si128(v3, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[6][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[6][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[6][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[6][3], si));

      v3 = _mm_srli_epi32(v3, 4);
      si = _mm_and_si128(v3, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si));

      _mm_store_si128((__m128i *) d32, p0);
      _mm_store_si128((__m128i *) (d32+4), p1);
      _mm_store_si128((__m128i *) (d32+8), p2);
      _mm_store_si128((__m128i *) (d32+12), p3);
      d32 += 16;
    }
  } else {
    while (d32 != top) {

      v0 = _mm_load_si128((__m128i *) s32); s32 += 4;
      v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
      v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
      v3 = _mm_load_si128((__m128i *) s32); s32 += 4;

      si = _mm_and_si128(v0, mask1);
      p0 = _mm_shuffle_epi8(tables[0][0], si);
      p1 = _mm_shuffle_epi8(tables[0][1], si);
      p2 = _mm_shuffle_epi8(tables[0][2], si);
      p3 = _mm_shuffle_epi8(tables[0][3], si);

      v0 = _mm_srli_epi32(v0, 4);
      si = _mm_and_si128(v0, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si));

      si = _mm_and_si128(v1, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si));

      v1 = _mm_srli_epi32(v1, 4);
      si = _mm_and_si128(v1, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si));

      si = _mm_and_si128(v2, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si));

      v2 = _mm_srli_epi32(v2, 4);
      si = _mm_and_si128(v2, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si));

      si = _mm_and_si128(v3, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[6][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[6][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[6][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[6][3], si));

      v3 = _mm_srli_epi32(v3, 4);
      si = _mm_and_si128(v3, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si));

      _mm_store_si128((__m128i *) d32, p0);
      _mm_store_si128((__m128i *) (d32+4), p1);
      _mm_store_si128((__m128i *) (d32+8), p2);
      _mm_store_si128((__m128i *) (d32+12), p3);
      d32 += 16;
    }
  }

  gf_do_final_region_alignment(&rd);
}
#endif
#ifdef INTEL_SSSE3
static
void
gf_w32_split_4_32_lazy_sse_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
  gf_internal_t *h;
  int i, j, k;
  uint32_t pp, v, *s32, *d32, *top, tmp_table[16];
  __m128i si, tables[8][4], p0, p1, p2, p3, mask1, v0, v1, v2, v3, mask8;
  __m128i tv1, tv2, tv3, tv0;
  uint8_t btable[16];
  gf_region_data rd;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  if (val == 1) { gf_multby_one(src, dest, bytes, xor); return; }

  h = (gf_internal_t *) gf->scratch;
  pp = h->prim_poly;

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 64);
  gf_do_initial_region_alignment(&rd);

  s32 = (uint32_t *) rd.s_start;
  d32 = (uint32_t *) rd.d_start;
  top = (uint32_t *) rd.d_top;

  v = val;
  for (i = 0; i < 8; i++) {
    tmp_table[0] = 0;
    for (j = 1; j < 16; j <<= 1) {
      for (k = 0; k < j; k++) {
        tmp_table[k^j] = (v ^ tmp_table[k]);
      }
      v = (v & GF_FIRST_BIT) ? ((v << 1) ^ pp) : (v << 1);
    }
    for (j = 0; j < 4; j++) {
      for (k = 0; k < 16; k++) {
        btable[k] = (uint8_t) tmp_table[k];
        tmp_table[k] >>= 8;
      }
      tables[i][j] = _mm_loadu_si128((__m128i *) btable);
    }
  }

  mask1 = _mm_set1_epi8(0xf);
  mask8 = _mm_set1_epi16(0xff);

  if (xor) {
    while (d32 != top) {
      v0 = _mm_load_si128((__m128i *) s32); s32 += 4;
      v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
      v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
      v3 = _mm_load_si128((__m128i *) s32); s32 += 4;

      p0 = _mm_srli_epi16(v0, 8);
      p1 = _mm_srli_epi16(v1, 8);
      p2 = _mm_srli_epi16(v2, 8);
      p3 = _mm_srli_epi16(v3, 8);

      tv0 = _mm_and_si128(v0, mask8);
      tv1 = _mm_and_si128(v1, mask8);
      tv2 = _mm_and_si128(v2, mask8);
      tv3 = _mm_and_si128(v3, mask8);

      v0 = _mm_packus_epi16(p1, p0);
      v1 = _mm_packus_epi16(tv1, tv0);
      v2 = _mm_packus_epi16(p3, p2);
      v3 = _mm_packus_epi16(tv3, tv2);

      p0 = _mm_srli_epi16(v0, 8);
      p1 = _mm_srli_epi16(v1, 8);
      p2 = _mm_srli_epi16(v2, 8);
      p3 = _mm_srli_epi16(v3, 8);

      tv0 = _mm_and_si128(v0, mask8);
      tv1 = _mm_and_si128(v1, mask8);
      tv2 = _mm_and_si128(v2, mask8);
      tv3 = _mm_and_si128(v3, mask8);

      v0 = _mm_packus_epi16(p2, p0);
      v1 = _mm_packus_epi16(p3, p1);
      v2 = _mm_packus_epi16(tv2, tv0);
      v3 = _mm_packus_epi16(tv3, tv1);

      si = _mm_and_si128(v0, mask1);
      p0 = _mm_shuffle_epi8(tables[6][0], si);
      p1 = _mm_shuffle_epi8(tables[6][1], si);
      p2 = _mm_shuffle_epi8(tables[6][2], si);
      p3 = _mm_shuffle_epi8(tables[6][3], si);

      v0 = _mm_srli_epi32(v0, 4);
      si = _mm_and_si128(v0, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si));

      si = _mm_and_si128(v1, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si));

      v1 = _mm_srli_epi32(v1, 4);
      si = _mm_and_si128(v1, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si));

      si = _mm_and_si128(v2, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si));

      v2 = _mm_srli_epi32(v2, 4);
      si = _mm_and_si128(v2, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si));

      si = _mm_and_si128(v3, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si));

      v3 = _mm_srli_epi32(v3, 4);
      si = _mm_and_si128(v3, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si));

      tv0 = _mm_unpackhi_epi8(p1, p3);
      tv1 = _mm_unpackhi_epi8(p0, p2);
      tv2 = _mm_unpacklo_epi8(p1, p3);
      tv3 = _mm_unpacklo_epi8(p0, p2);

      p0 = _mm_unpackhi_epi8(tv1, tv0);
      p1 = _mm_unpacklo_epi8(tv1, tv0);
      p2 = _mm_unpackhi_epi8(tv3, tv2);
      p3 = _mm_unpacklo_epi8(tv3, tv2);

      v0 = _mm_load_si128 ((__m128i *) d32);
      v1 = _mm_load_si128 ((__m128i *) (d32+4));
      v2 = _mm_load_si128 ((__m128i *) (d32+8));
      v3 = _mm_load_si128 ((__m128i *) (d32+12));

      p0 = _mm_xor_si128(p0, v0);
      p1 = _mm_xor_si128(p1, v1);
      p2 = _mm_xor_si128(p2, v2);
      p3 = _mm_xor_si128(p3, v3);

      _mm_store_si128((__m128i *) d32, p0);
      _mm_store_si128((__m128i *) (d32+4), p1);
      _mm_store_si128((__m128i *) (d32+8), p2);
      _mm_store_si128((__m128i *) (d32+12), p3);
      d32 += 16;
    }
  } else {
    while (d32 != top) {
      v0 = _mm_load_si128((__m128i *) s32); s32 += 4;
      v1 = _mm_load_si128((__m128i *) s32); s32 += 4;
      v2 = _mm_load_si128((__m128i *) s32); s32 += 4;
      v3 = _mm_load_si128((__m128i *) s32); s32 += 4;

      p0 = _mm_srli_epi16(v0, 8);
      p1 = _mm_srli_epi16(v1, 8);
      p2 = _mm_srli_epi16(v2, 8);
      p3 = _mm_srli_epi16(v3, 8);

      tv0 = _mm_and_si128(v0, mask8);
      tv1 = _mm_and_si128(v1, mask8);
      tv2 = _mm_and_si128(v2, mask8);
      tv3 = _mm_and_si128(v3, mask8);

      v0 = _mm_packus_epi16(p1, p0);
      v1 = _mm_packus_epi16(tv1, tv0);
      v2 = _mm_packus_epi16(p3, p2);
      v3 = _mm_packus_epi16(tv3, tv2);

      p0 = _mm_srli_epi16(v0, 8);
      p1 = _mm_srli_epi16(v1, 8);
      p2 = _mm_srli_epi16(v2, 8);
      p3 = _mm_srli_epi16(v3, 8);

      tv0 = _mm_and_si128(v0, mask8);
      tv1 = _mm_and_si128(v1, mask8);
      tv2 = _mm_and_si128(v2, mask8);
      tv3 = _mm_and_si128(v3, mask8);

      v0 = _mm_packus_epi16(p2, p0);
      v1 = _mm_packus_epi16(p3, p1);
      v2 = _mm_packus_epi16(tv2, tv0);
      v3 = _mm_packus_epi16(tv3, tv1);

      si = _mm_and_si128(v0, mask1);
      p0 = _mm_shuffle_epi8(tables[6][0], si);
      p1 = _mm_shuffle_epi8(tables[6][1], si);
      p2 = _mm_shuffle_epi8(tables[6][2], si);
      p3 = _mm_shuffle_epi8(tables[6][3], si);

      v0 = _mm_srli_epi32(v0, 4);
      si = _mm_and_si128(v0, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[7][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[7][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[7][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[7][3], si));

      si = _mm_and_si128(v1, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[4][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[4][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[4][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[4][3], si));

      v1 = _mm_srli_epi32(v1, 4);
      si = _mm_and_si128(v1, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[5][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[5][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[5][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[5][3], si));

      si = _mm_and_si128(v2, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[2][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[2][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[2][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[2][3], si));

      v2 = _mm_srli_epi32(v2, 4);
      si = _mm_and_si128(v2, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[3][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[3][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[3][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[3][3], si));

      si = _mm_and_si128(v3, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[0][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[0][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[0][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[0][3], si));

      v3 = _mm_srli_epi32(v3, 4);
      si = _mm_and_si128(v3, mask1);
      p0 = _mm_xor_si128(p0, _mm_shuffle_epi8(tables[1][0], si));
      p1 = _mm_xor_si128(p1, _mm_shuffle_epi8(tables[1][1], si));
      p2 = _mm_xor_si128(p2, _mm_shuffle_epi8(tables[1][2], si));
      p3 = _mm_xor_si128(p3, _mm_shuffle_epi8(tables[1][3], si));

      tv0 = _mm_unpackhi_epi8(p1, p3);
      tv1 = _mm_unpackhi_epi8(p0, p2);
      tv2 = _mm_unpacklo_epi8(p1, p3);
      tv3 = _mm_unpacklo_epi8(p0, p2);

      p0 = _mm_unpackhi_epi8(tv1, tv0);
      p1 = _mm_unpacklo_epi8(tv1, tv0);
      p2 = _mm_unpackhi_epi8(tv3, tv2);
      p3 = _mm_unpacklo_epi8(tv3, tv2);

      _mm_store_si128((__m128i *) d32, p0);
      _mm_store_si128((__m128i *) (d32+4), p1);
      _mm_store_si128((__m128i *) (d32+8), p2);
      _mm_store_si128((__m128i *) (d32+12), p3);
      d32 += 16;
    }
  }

  gf_do_final_region_alignment(&rd);
}
#endif
static
int gf_w32_split_init(gf_t *gf)
{
  gf_internal_t *h;
  struct gf_split_2_32_lazy_data *ld2;
  struct gf_split_4_32_lazy_data *ld4;
  struct gf_w32_split_8_8_data *d8;
  struct gf_split_8_32_lazy_data *d32;
  struct gf_split_16_32_lazy_data *d16;
  uint32_t p, basep;
  int i, j, exp;

  h = (gf_internal_t *) gf->scratch;

  SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)

  /* JSP: First handle single multiplication:
     If args == 8, then we're doing split 8 8.
     Otherwise, if PCLMUL, we use that.
     Otherwise, we use bytwo_p. */

  if (h->arg1 == 8 && h->arg2 == 8) {
    SET_FUNCTION(gf,multiply,w32,gf_w32_split_8_8_multiply)
#if defined(INTEL_SSE4_PCLMUL)
  } else if (gf_cpu_supports_intel_pclmul) {
    if ((0xfffe0000 & h->prim_poly) == 0) {
      SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_2)
    } else if ((0xffc00000 & h->prim_poly) == 0) {
      SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_3)
    } else if ((0xfe000000 & h->prim_poly) == 0) {
      SET_FUNCTION(gf,multiply,w32,gf_w32_clm_multiply_4)
    }
#endif
  } else {
    SET_FUNCTION(gf,multiply,w32,gf_w32_bytwo_p_multiply)
  }

  /* Easy cases: 16/32 and 2/32 */

  if ((h->arg1 == 16 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 16)) {
    d16 = (struct gf_split_16_32_lazy_data *) h->private;
    d16->last_value = 0;
    SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_16_32_lazy_multiply_region)
    return 1;
  }

  if ((h->arg1 == 2 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 2)) {
    ld2 = (struct gf_split_2_32_lazy_data *) h->private;
    ld2->last_value = 0;
    if (gf_cpu_supports_intel_ssse3 && !(h->region_type & GF_REGION_NOSIMD)) {
      SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_2_32_lazy_sse_multiply_region)
    } else {
      SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_2_32_lazy_multiply_region)
      if (h->region_type & GF_REGION_SIMD) return 0;
    }
    return 1;
  }

  /* 4/32 or Default + SSE - There is no ALTMAP/NOSSE. */

  if ((h->arg1 == 4 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 4) ||
      ((gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon) && h->mult_type == GF_MULT_DEFAULT)) {
    ld4 = (struct gf_split_4_32_lazy_data *) h->private;
    ld4->last_value = 0;
    if ((h->region_type & GF_REGION_NOSIMD) || !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon)) {
      SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_multiply_region)
    } else if (gf_cpu_supports_arm_neon) {
#ifdef ARM_NEON
      gf_w32_neon_split_init(gf);
#endif
    } else if (h->region_type & GF_REGION_ALTMAP) {
#ifdef INTEL_SSSE3
      SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_sse_altmap_multiply_region)
#endif
    } else {
#ifdef INTEL_SSSE3
      SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_4_32_lazy_sse_multiply_region)
#endif
    }
    return 1;
  }

  /* 8/32 or Default + no SSE */

  if ((h->arg1 == 8 && h->arg2 == 32) || (h->arg1 == 32 && h->arg2 == 8) ||
      h->mult_type == GF_MULT_DEFAULT) {
    d32 = (struct gf_split_8_32_lazy_data *) h->private;
    d32->last_value = 0;
    SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_8_32_lazy_multiply_region)
    return 1;
  }

  /* Finally, if args == 8, then we have to set up the tables here. */

  if (h->arg1 == 8 && h->arg2 == 8) {
    d8 = (struct gf_w32_split_8_8_data *) h->private;

    SET_FUNCTION(gf,multiply,w32,gf_w32_split_8_8_multiply)
    SET_FUNCTION(gf,multiply_region,w32,gf_w32_split_8_32_lazy_multiply_region)
    basep = 1;
    for (exp = 0; exp < 7; exp++) {
      for (j = 0; j < 256; j++) d8->tables[exp][0][j] = 0;
      for (i = 0; i < 256; i++) d8->tables[exp][i][0] = 0;
      d8->tables[exp][1][1] = basep;
      for (i = 2; i < 256; i++) {
        if (i&1) {
          p = d8->tables[exp][i^1][1];
          d8->tables[exp][i][1] = p ^ basep;
        } else {
          p = d8->tables[exp][i>>1][1];
          d8->tables[exp][i][1] = GF_MULTBY_TWO(p);
        }
      }
      for (i = 1; i < 256; i++) {
        p = d8->tables[exp][i][1];
        for (j = 1; j < 256; j++) {
          if (j&1) {
            d8->tables[exp][i][j] = d8->tables[exp][i][j^1] ^ p;
          } else {
            d8->tables[exp][i][j] = GF_MULTBY_TWO(d8->tables[exp][i][j>>1]);
          }
        }
      }
      for (i = 0; i < 8; i++) basep = GF_MULTBY_TWO(basep);
    }
    return 1;
  }

  /* If we get here, then the arguments were bad. */

  return 0;
}
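
/* A minimal usage sketch for the path above (hypothetical test code, not part
   of the library): requesting SPLIT with arg1/arg2 = 4/32 routes through
   gf_w32_split_init(), which picks the SSE/NEON region function when the CPU
   allows it. gf_init_hard() and gf_free() are the public entry points
   declared in gf_complete.h. */
#ifdef GF_W32_EXAMPLE_SKETCHES
static
int gf_w32_split_usage_sketch(void)
{
  gf_t gf;
  uint32_t p;

  if (gf_init_hard(&gf, 32, GF_MULT_SPLIT_TABLE, GF_REGION_DEFAULT,
                   GF_DIVIDE_DEFAULT, 0, 4, 32, NULL, NULL) == 0) return 0;
  p = gf.multiply.w32(&gf, 0xdeadbeef, 0x01020304);  /* one product in GF(2^32) */
  gf_free(&gf, 0);
  return (p != 0);
}
#endif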
static
int gf_w32_group_init(gf_t *gf)
{
  uint32_t i, j, p, index;
  struct gf_w32_group_data *gd;
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  uint32_t g_r, g_s;

  g_s = h->arg1;
  g_r = h->arg2;

  gd = (struct gf_w32_group_data *) h->private;
  gd->shift = (uint32_t *) (&(gd->memory));
  gd->reduce = gd->shift + (1 << g_s);

  gd->rmask = (1 << g_r) - 1;

  gd->tshift = 32 % g_s;
  if (gd->tshift == 0) gd->tshift = g_s;
  gd->tshift = (32 - gd->tshift);
  gd->tshift = ((gd->tshift-1)/g_r) * g_r;

  for (i = 0; i < ((uint32_t)1 << g_r); i++) {
    p = 0;
    index = 0;
    for (j = 0; j < g_r; j++) {
      if (i & (1 << j)) {
        p ^= (h->prim_poly << j);
        index ^= (1 << j);
        index ^= (h->prim_poly >> (32-j));
      }
    }
    gd->reduce[index] = p;
  }

  if (g_s == g_r) {
    SET_FUNCTION(gf,multiply,w32,gf_w32_group_s_equals_r_multiply)
    SET_FUNCTION(gf,multiply_region,w32,gf_w32_group_s_equals_r_multiply_region)
  } else {
    SET_FUNCTION(gf,multiply,w32,gf_w32_group_multiply)
    SET_FUNCTION(gf,multiply_region,w32,gf_w32_group_multiply_region)
  }
  SET_FUNCTION(gf,divide,w32,NULL)
  SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)

  return 1;
}
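
/* The idea behind GROUP multiplication, as a hedged standalone sketch
   (hypothetical helper, not the library's implementation): consume b in
   s-bit windows from a small table of products of a, deferring reduction of
   the bits that spill past bit 31. For brevity this sketch fixes s = 2 and
   reduces bit-by-bit at the end, where the real gf_w32_group_multiply()
   folds spilled bits back in through the 2^r-entry reduce table built above.
   prim_poly holds the low 32 bits of the polynomial (no leading x^32 term,
   as this file stores it after init). */
#ifdef GF_W32_EXAMPLE_SKETCHES
static
uint32_t gf_w32_group_sketch(uint32_t a, uint32_t b, uint64_t prim_poly)
{
  uint64_t shift[4], p;
  int i;

  shift[0] = 0;                       /* a times 0, 1, x, x+1 over GF(2) */
  shift[1] = a;
  shift[2] = (uint64_t) a << 1;
  shift[3] = shift[2] ^ a;

  p = 0;
  for (i = 30; i >= 0; i -= 2) p = (p << 2) ^ shift[(b >> i) & 3];

  for (i = 62; i >= 32; i--) {        /* deferred polynomial reduction */
    if (p & ((uint64_t) 1 << i)) p ^= (((uint64_t) 1 << i) ^ (prim_poly << (i-32)));
  }
  return (uint32_t) p;
}
#endif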
static
uint32_t
gf_w32_composite_multiply_recursive(gf_t *gf, uint32_t a, uint32_t b)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint32_t b0 = b & 0x0000ffff;
  uint32_t b1 = (b & 0xffff0000) >> 16;
  uint32_t a0 = a & 0x0000ffff;
  uint32_t a1 = (a & 0xffff0000) >> 16;
  uint32_t a1b1, rv;

  a1b1 = base_gf->multiply.w32(base_gf, a1, b1);

  rv = ((base_gf->multiply.w32(base_gf, a1, b0) ^
         base_gf->multiply.w32(base_gf, a0, b1) ^
         base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16) |
       (base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1);
  return rv;
}
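
/* Worked algebra for the recursive multiply above: with a = a1*x + a0 and
   b = b1*x + b0 over GF(2^16), and the composite polynomial x^2 + s*x + 1
   (s = h->prim_poly), we have
       a*b = a1b1*x^2 + (a1b0 + a0b1)*x + a0b0
           = (a1b0 + a0b1 + s*a1b1)*x + (a0b0 + a1b1)   since x^2 = s*x + 1.
   The high product a1b1 is computed once and feeds both halves, which is
   exactly how rv is assembled. */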
/* JSP: This could be made faster. Someday, when I'm bored. */

static
uint32_t
gf_w32_composite_multiply_inline(gf_t *gf, uint32_t a, uint32_t b)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  uint32_t b0 = b & 0x0000ffff;
  uint32_t b1 = b >> 16;
  uint32_t a0 = a & 0x0000ffff;
  uint32_t a1 = a >> 16;
  uint32_t a1b1, prod;
  uint16_t *log, *alog;
  struct gf_w32_composite_data *cd;

  cd = (struct gf_w32_composite_data *) h->private;
  log = cd->log;
  alog = cd->alog;

  a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1);

  prod = GF_W16_INLINE_MULT(log, alog, a1, b0);
  prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1);
  prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly);
  prod <<= 16;
  prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0);

  return prod;
}
/*
 * Composite field division trick (explained in 2007 tech report)
 *
 * Compute a / b = a*b^-1, where p(x) = x^2 + sx + 1
 *
 * let c = b^-1
 *
 * c*b = (s*b1c1+b1c0+b0c1)x + (b1c1+b0c0)
 *
 * want (s*b1c1+b1c0+b0c1) = 0 and (b1c1+b0c0) = 1
 *
 * let d = b1c1 and d+1 = b0c0
 *
 * solve s*b1c1+b1c0+b0c1 = 0
 *
 * solution: d = (b1b0^-1)(b1b0^-1+b0b1^-1+s)^-1
 *
 * c0 = (d+1)b0^-1
 * c1 = d*b1^-1
 */
static
uint32_t
gf_w32_composite_inverse(gf_t *gf, uint32_t a)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint16_t a0 = a & 0x0000ffff;
  uint16_t a1 = (a & 0xffff0000) >> 16;
  uint16_t c0, c1, d, tmp;
  uint32_t c;
  uint16_t a0inv, a1inv;

  if (a0 == 0) {
    a1inv = base_gf->inverse.w32(base_gf, a1);
    c0 = base_gf->multiply.w32(base_gf, a1inv, h->prim_poly);
    c1 = a1inv;
  } else if (a1 == 0) {
    c0 = base_gf->inverse.w32(base_gf, a0);
    c1 = 0;
  } else {
    a1inv = base_gf->inverse.w32(base_gf, a1);
    a0inv = base_gf->inverse.w32(base_gf, a0);

    d = base_gf->multiply.w32(base_gf, a1, a0inv);

    tmp = (base_gf->multiply.w32(base_gf, a1, a0inv) ^
           base_gf->multiply.w32(base_gf, a0, a1inv) ^ h->prim_poly);
    tmp = base_gf->inverse.w32(base_gf, tmp);

    d = base_gf->multiply.w32(base_gf, d, tmp);

    c0 = base_gf->multiply.w32(base_gf, (d^1), a0inv);
    c1 = base_gf->multiply.w32(base_gf, d, a1inv);
  }

  c = c0 | (c1 << 16);

  return c;
}
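
/* A quick self-check sketch for the inverse above (hypothetical test helper,
   not part of the library): for nonzero a, a * a^-1 must come back as 1.
   Uses only the public function-pointer interface of gf_t. */
#ifdef GF_W32_EXAMPLE_SKETCHES
static
int gf_w32_composite_inverse_check(gf_t *gf, uint32_t a)
{
  uint32_t ainv;

  if (a == 0) return 1;      /* zero has no inverse; nothing to check */
  ainv = gf->inverse.w32(gf, a);
  return (gf->multiply.w32(gf, a, ainv) == 1);
}
#endif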
static
void
gf_w32_composite_multiply_region(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint32_t b0 = val & 0x0000ffff;
  uint32_t b1 = (val & 0xffff0000) >> 16;
  uint32_t *s32, *d32, *top;
  uint16_t a0, a1, a1b1, *log, *alog;
  uint32_t prod;
  gf_region_data rd;
  struct gf_w32_composite_data *cd;

  cd = (struct gf_w32_composite_data *) h->private;
  log = cd->log;
  alog = cd->alog;

  if (val == 0) { gf_multby_zero(dest, bytes, xor); return; }
  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 4);

  s32 = (uint32_t *) rd.s_start;
  d32 = (uint32_t *) rd.d_start;
  top = (uint32_t *) rd.d_top;

  if (log == NULL) {
    if (xor) {
      while (d32 < top) {
        a0 = *s32 & 0x0000ffff;
        a1 = (*s32 & 0xffff0000) >> 16;
        a1b1 = base_gf->multiply.w32(base_gf, a1, b1);

        *d32 ^= ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
                ((base_gf->multiply.w32(base_gf, a1, b0) ^
                  base_gf->multiply.w32(base_gf, a0, b1) ^
                  base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16));
        s32++;
        d32++;
      }
    } else {
      while (d32 < top) {
        a0 = *s32 & 0x0000ffff;
        a1 = (*s32 & 0xffff0000) >> 16;
        a1b1 = base_gf->multiply.w32(base_gf, a1, b1);

        *d32 = ((base_gf->multiply.w32(base_gf, a0, b0) ^ a1b1) |
               ((base_gf->multiply.w32(base_gf, a1, b0) ^
                 base_gf->multiply.w32(base_gf, a0, b1) ^
                 base_gf->multiply.w32(base_gf, a1b1, h->prim_poly)) << 16));
        s32++;
        d32++;
      }
    }
  } else {
    if (xor) {
      while (d32 < top) {
        a0 = *s32 & 0x0000ffff;
        a1 = (*s32 & 0xffff0000) >> 16;
        a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1);

        prod = GF_W16_INLINE_MULT(log, alog, a1, b0);
        prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1);
        prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly);
        prod <<= 16;
        prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0);
        *d32 ^= prod;
        s32++;
        d32++;
      }
    } else {
      while (d32 < top) {
        a0 = *s32 & 0x0000ffff;
        a1 = (*s32 & 0xffff0000) >> 16;
        a1b1 = GF_W16_INLINE_MULT(log, alog, a1, b1);

        prod = GF_W16_INLINE_MULT(log, alog, a1, b0);
        prod ^= GF_W16_INLINE_MULT(log, alog, a0, b1);
        prod ^= GF_W16_INLINE_MULT(log, alog, a1b1, h->prim_poly);
        prod <<= 16;
        prod ^= GF_W16_INLINE_MULT(log, alog, a0, b0);
        *d32 = prod;
        s32++;
        d32++;
      }
    }
  }
}
static
void
gf_w32_composite_multiply_region_alt(gf_t *gf, void *src, void *dest, uint32_t val, int bytes, int xor)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  gf_t *base_gf = h->base_gf;
  uint16_t val0 = val & 0x0000ffff;
  uint16_t val1 = (val & 0xffff0000) >> 16;
  gf_region_data rd;
  int sub_reg_size;
  uint8_t *slow, *shigh;
  uint8_t *dlow, *dhigh, *top;

  /* JSP: I want the two pointers aligned wrt each other on 16 byte
     boundaries.  So I'm going to make sure that the area on
     which the two operate is a multiple of 32. Of course, that
     junks up the mapping, but so be it -- that's why we have extract_word.... */

  gf_set_region_data(&rd, gf, src, dest, bytes, val, xor, 32);
  gf_do_initial_region_alignment(&rd);

  slow = (uint8_t *) rd.s_start;
  dlow = (uint8_t *) rd.d_start;
  top = (uint8_t *) rd.d_top;
  sub_reg_size = (top - dlow)/2;
  shigh = slow + sub_reg_size;
  dhigh = dlow + sub_reg_size;

  base_gf->multiply_region.w32(base_gf, slow, dlow, val0, sub_reg_size, xor);
  base_gf->multiply_region.w32(base_gf, shigh, dlow, val1, sub_reg_size, 1);
  base_gf->multiply_region.w32(base_gf, slow, dhigh, val1, sub_reg_size, xor);
  base_gf->multiply_region.w32(base_gf, shigh, dhigh, val0, sub_reg_size, 1);
  base_gf->multiply_region.w32(base_gf, shigh, dhigh, base_gf->multiply.w32(base_gf, h->prim_poly, val1), sub_reg_size, 1);

  gf_do_final_region_alignment(&rd);
}
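
/* Worked accounting for the five region calls above: with the ALTMAP layout,
   slow/shigh hold the low and high base-field halves (a0, a1) of the words,
   and val splits into (val0, val1) = (b0, b1). After the calls,
       dlow  (+)= a0*b0 + a1*b1
       dhigh (+)= a0*b1 + a1*b0 + s*a1*b1   (s = h->prim_poly)
   which are the constant and x coefficients of a*b mod x^2 + s*x + 1 -- the
   same formula the single-word composite multiply uses. */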
static
int gf_w32_composite_init(gf_t *gf)
{
  gf_internal_t *h = (gf_internal_t *) gf->scratch;
  struct gf_w32_composite_data *cd;

  if (h->base_gf == NULL) return 0;

  cd = (struct gf_w32_composite_data *) h->private;
  cd->log = gf_w16_get_log_table(h->base_gf);
  cd->alog = gf_w16_get_mult_alog_table(h->base_gf);

  if (h->region_type & GF_REGION_ALTMAP) {
    SET_FUNCTION(gf,multiply_region,w32,gf_w32_composite_multiply_region_alt)
  } else {
    SET_FUNCTION(gf,multiply_region,w32,gf_w32_composite_multiply_region)
  }

  if (cd->log == NULL) {
    SET_FUNCTION(gf,multiply,w32,gf_w32_composite_multiply_recursive)
  } else {
    SET_FUNCTION(gf,multiply,w32,gf_w32_composite_multiply_inline)
  }
  SET_FUNCTION(gf,divide,w32,NULL)
  SET_FUNCTION(gf,inverse,w32,gf_w32_composite_inverse)

  return 1;
}
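
/* Usage sketch for the composite path (hypothetical test code): build a
   GF(2^16) base field, then stack GF((2^16)^2) on top with GF_MULT_COMPOSITE
   (arg1 = 2 for degree 2, arg2 unused). The argument order follows
   gf_init_hard() as declared in gf_complete.h. */
#ifdef GF_W32_EXAMPLE_SKETCHES
static
int gf_w32_composite_usage_sketch(void)
{
  gf_t base, gf;

  if (gf_init_easy(&base, 16) == 0) return 0;
  if (gf_init_hard(&gf, 32, GF_MULT_COMPOSITE, GF_REGION_DEFAULT,
                   GF_DIVIDE_DEFAULT, 0, 2, 0, &base, NULL) == 0) return 0;
  if (gf.multiply.w32(&gf, 0x10001, 1) != 0x10001) return 0;  /* identity */
  gf_free(&gf, 0);
  gf_free(&base, 0);
  return 1;
}
#endif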
int gf_w32_scratch_size(int mult_type, int region_type, int divide_type, int arg1, int arg2)
{
  switch(mult_type)
  {
    case GF_MULT_BYTWO_p:
    case GF_MULT_BYTWO_b:
      return sizeof(gf_internal_t) + sizeof(struct gf_w32_bytwo_data) + 64;
      break;
    case GF_MULT_GROUP:
      return sizeof(gf_internal_t) + sizeof(struct gf_w32_group_data) +
             sizeof(uint32_t) * (1 << arg1) +
             sizeof(uint32_t) * (1 << arg2) + 64;
      break;
    case GF_MULT_DEFAULT:

    case GF_MULT_SPLIT_TABLE:
      if (arg1 == 8 && arg2 == 8) {
        return sizeof(gf_internal_t) + sizeof(struct gf_w32_split_8_8_data) + 64;
      }
      if ((arg1 == 16 && arg2 == 32) || (arg2 == 16 && arg1 == 32)) {
        return sizeof(gf_internal_t) + sizeof(struct gf_split_16_32_lazy_data) + 64;
      }
      if ((arg1 == 2 && arg2 == 32) || (arg2 == 2 && arg1 == 32)) {
        return sizeof(gf_internal_t) + sizeof(struct gf_split_2_32_lazy_data) + 64;
      }
      if ((arg1 == 8 && arg2 == 32) || (arg2 == 8 && arg1 == 32) ||
          (mult_type == GF_MULT_DEFAULT && !(gf_cpu_supports_intel_ssse3 || gf_cpu_supports_arm_neon))) {
        return sizeof(gf_internal_t) + sizeof(struct gf_split_8_32_lazy_data) + 64;
      }
      if ((arg1 == 4 && arg2 == 32) ||
          (arg2 == 4 && arg1 == 32) ||
          mult_type == GF_MULT_DEFAULT) {
        return sizeof(gf_internal_t) + sizeof(struct gf_split_4_32_lazy_data) + 64;
      }
      return 0;
    case GF_MULT_CARRY_FREE:
      return sizeof(gf_internal_t);
      break;
    case GF_MULT_CARRY_FREE_GK:
      return sizeof(gf_internal_t) + sizeof(uint64_t)*2;
      break;
    case GF_MULT_SHIFT:
      return sizeof(gf_internal_t);
      break;
    case GF_MULT_COMPOSITE:
      return sizeof(gf_internal_t) + sizeof(struct gf_w32_composite_data) + 64;
      break;
    default:
      return 0;
  }
  return 0;
}
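
/* Sketch of how callers pair this sizing with caller-owned memory
   (hypothetical test code): gf_scratch_size() is the public dispatcher that
   ends up here for w = 32, and the returned byte count is what gf_init_hard()
   expects in its scratch_memory argument. Assumes <stdlib.h> is available. */
#ifdef GF_W32_EXAMPLE_SKETCHES
static
int gf_w32_scratch_usage_sketch(void)
{
  gf_t gf;
  void *mem;
  int sz;

  sz = gf_scratch_size(32, GF_MULT_GROUP, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT, 4, 4);
  if (sz <= 0 || (mem = malloc(sz)) == NULL) return 0;
  if (gf_init_hard(&gf, 32, GF_MULT_GROUP, GF_REGION_DEFAULT,
                   GF_DIVIDE_DEFAULT, 0, 4, 4, NULL, mem) == 0) { free(mem); return 0; }
  gf_free(&gf, 0);   /* caller-provided scratch stays the caller's to free */
  free(mem);
  return 1;
}
#endif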
int gf_w32_init(gf_t *gf)
{
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;

  /* Allen: set default primitive polynomial / irreducible polynomial if needed */

  if (h->prim_poly == 0) {
    if (h->mult_type == GF_MULT_COMPOSITE) {
      h->prim_poly = gf_composite_get_default_poly(h->base_gf);
      if (h->prim_poly == 0) return 0; /* This shouldn't happen */
    } else {

      /* Allen: use the following primitive polynomial to make
         carryless multiply work more efficiently for GF(2^32). */

      /* h->prim_poly = 0xc5; */

      /* Allen: The following is the traditional primitive polynomial for GF(2^32) */

      h->prim_poly = 0x400007;
    }
  }

  /* No leading one */

  if (h->mult_type != GF_MULT_COMPOSITE) h->prim_poly &= 0xffffffff;

  SET_FUNCTION(gf,multiply,w32,NULL)
  SET_FUNCTION(gf,divide,w32,NULL)
  SET_FUNCTION(gf,inverse,w32,NULL)
  SET_FUNCTION(gf,multiply_region,w32,NULL)

  switch(h->mult_type) {
    case GF_MULT_CARRY_FREE:    if (gf_w32_cfm_init(gf) == 0) return 0; break;
    case GF_MULT_CARRY_FREE_GK: if (gf_w32_cfmgk_init(gf) == 0) return 0; break;
    case GF_MULT_SHIFT:         if (gf_w32_shift_init(gf) == 0) return 0; break;
    case GF_MULT_COMPOSITE:     if (gf_w32_composite_init(gf) == 0) return 0; break;
    case GF_MULT_DEFAULT:
    case GF_MULT_SPLIT_TABLE:   if (gf_w32_split_init(gf) == 0) return 0; break;
    case GF_MULT_GROUP:         if (gf_w32_group_init(gf) == 0) return 0; break;
    case GF_MULT_BYTWO_p:
    case GF_MULT_BYTWO_b:       if (gf_w32_bytwo_init(gf) == 0) return 0; break;
    default: return 0;
  }

  if (h->divide_type == GF_DIVIDE_EUCLID) {
    SET_FUNCTION(gf,divide,w32,gf_w32_divide_from_inverse)
    SET_FUNCTION(gf,inverse,w32,gf_w32_euclid)
  } else if (h->divide_type == GF_DIVIDE_MATRIX) {
    SET_FUNCTION(gf,divide,w32,gf_w32_divide_from_inverse)
    SET_FUNCTION(gf,inverse,w32,gf_w32_matrix)
  }

  if (gf->inverse.w32 != NULL && gf->divide.w32 == NULL) {
    SET_FUNCTION(gf,divide,w32,gf_w32_divide_from_inverse)
  }
  if (gf->inverse.w32 == NULL && gf->divide.w32 != NULL) {
    SET_FUNCTION(gf,inverse,w32,gf_w32_inverse_from_divide)
  }

  if (h->region_type == GF_REGION_CAUCHY) {
    SET_FUNCTION(gf,extract_word,w32,gf_wgen_extract_word)
    SET_FUNCTION(gf,multiply_region,w32,gf_wgen_cauchy_region)
  } else if (h->region_type & GF_REGION_ALTMAP) {
    if (h->mult_type == GF_MULT_COMPOSITE) {
      SET_FUNCTION(gf,extract_word,w32,gf_w32_composite_extract_word)
    } else {
      SET_FUNCTION(gf,extract_word,w32,gf_w32_split_extract_word)
    }
  } else {
    SET_FUNCTION(gf,extract_word,w32,gf_w32_extract_word)
  }

  return 1;
}
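
/* End-to-end sketch (hypothetical test code): the default w = 32 path set up
   by gf_w32_init(), exercised through the public dispatch table. Buffer sizes
   and alignment are kept trivial here; real callers should honor the
   region-size multiples documented for their chosen method. */
#ifdef GF_W32_EXAMPLE_SKETCHES
static
void gf_w32_init_usage_sketch(void)
{
  gf_t gf;
  uint32_t src[4] = { 1, 2, 3, 0xdeadbeef };
  uint32_t dst[4];

  gf_init_easy(&gf, 32);
  (void) gf.multiply.w32(&gf, 0x12345678, 0x9abcdef0);       /* one product */
  gf.multiply_region.w32(&gf, src, dst, 7, sizeof(src), 0);  /* dst = 7 * src */
  gf_free(&gf, 0);
}
#endif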