/*
 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
 *
 * gf.c
 *
 * Generic routines for Galois fields
 */

#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include "gf_cpu.h"

int _gf_errno = GF_E_DEFAULT;

void gf_error()
{
  char *s;

  switch(_gf_errno) {
    case GF_E_DEFAULT: s = "No Error."; break;
    case GF_E_TWOMULT: s = "Cannot specify two -m's."; break;
    case GF_E_TWO_DIV: s = "Cannot specify two -d's."; break;
    case GF_E_POLYSPC: s = "-p needs to be followed by a number in hex (0x optional)."; break;
    case GF_E_GROUPAR: s = "Ran out of arguments in -m GROUP."; break;
    case GF_E_GROUPNU: s = "In -m GROUP g_s g_r -- g_s and g_r need to be numbers."; break;
    case GF_E_SPLITAR: s = "Ran out of arguments in -m SPLIT."; break;
    case GF_E_SPLITNU: s = "In -m SPLIT w_a w_b -- w_a and w_b need to be numbers."; break;
    case GF_E_FEWARGS: s = "Not enough arguments (Perhaps end with '-'?)"; break;
    case GF_E_CFM___W: s = "-m CARRY_FREE, w must be 4, 8, 16, 32, 64 or 128."; break;
    case GF_E_COMPXPP: s = "-m COMPOSITE, No poly specified, and we don't have a default for the given sub-field."; break;
    case GF_E_BASE__W: s = "-m COMPOSITE and the base field is not for w/2."; break;
    case GF_E_CFM4POL: s = "-m CARRY_FREE, w=4. (Prim-poly & 0xc) must equal 0."; break;
    case GF_E_CFM8POL: s = "-m CARRY_FREE, w=8. (Prim-poly & 0x80) must equal 0."; break;
    case GF_E_CF16POL: s = "-m CARRY_FREE, w=16. (Prim-poly & 0xe000) must equal 0."; break;
    case GF_E_CF32POL: s = "-m CARRY_FREE, w=32. (Prim-poly & 0xfe000000) must equal 0."; break;
    case GF_E_CF64POL: s = "-m CARRY_FREE, w=64. (Prim-poly & 0xfffe000000000000ULL) must equal 0."; break;
    case GF_E_MDEFDIV: s = "If multiplication method == default, can't change division."; break;
    case GF_E_MDEFREG: s = "If multiplication method == default, can't change region."; break;
    case GF_E_MDEFARG: s = "If multiplication method == default, can't use arg1/arg2."; break;
    case GF_E_DIVCOMP: s = "Cannot change the division technique with -m COMPOSITE."; break;
    case GF_E_DOUQUAD: s = "Cannot specify -r DOUBLE and -r QUAD."; break;
    case GF_E_SIMD_NO: s = "Cannot specify -r SIMD and -r NOSIMD."; break;
    case GF_E_CAUCHYB: s = "Cannot specify -r CAUCHY and any other -r."; break;
    case GF_E_CAUCOMP: s = "Cannot specify -m COMPOSITE and -r CAUCHY."; break;
    case GF_E_CAUGT32: s = "Cannot specify -r CAUCHY with w > 32."; break;
    case GF_E_ARG1SET: s = "Only use arg1 with SPLIT, GROUP or COMPOSITE."; break;
    case GF_E_ARG2SET: s = "Only use arg2 with SPLIT or GROUP."; break;
    case GF_E_MATRIXW: s = "Cannot specify -d MATRIX with w > 32."; break;
    case GF_E_BAD___W: s = "W must be 1-32, 64 or 128."; break;
    case GF_E_DOUBLET: s = "Can only specify -r DOUBLE with -m TABLE."; break;
    case GF_E_DOUBLEW: s = "Can only specify -r DOUBLE w = 4 or w = 8."; break;
    case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SIMD|NOSIMD."; break;
    case GF_E_DOUBLEL: s = "Can only specify -r DOUBLE -r LAZY with w = 8"; break;
    case GF_E_QUAD__T: s = "Can only specify -r QUAD with -m TABLE."; break;
    case GF_E_QUAD__W: s = "Can only specify -r QUAD w = 4."; break;
    case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SIMD|NOSIMD."; break;
    case GF_E_BADPOLY: s = "Bad primitive polynomial (high bits set)."; break;
    case GF_E_COMP_PP: s = "Bad primitive polynomial -- bigger than sub-field."; break;
    case GF_E_LAZY__X: s = "If -r LAZY, then -r must be DOUBLE or QUAD."; break;
    case GF_E_ALTSHIF: s = "Cannot specify -m SHIFT and -r ALTMAP."; break;
    case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SIMD|NOSIMD."; break;
    case GF_E_ALT_CFM: s = "Cannot specify -m CARRY_FREE and -r ALTMAP."; break;
    case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SIMD|NOSIMD."; break;
    case GF_E_PCLMULX: s = "Specified -m CARRY_FREE, but PCLMUL is not supported."; break;
    case GF_E_ALT_BY2: s = "Cannot specify -m BYTWO_x and -r ALTMAP."; break;
    case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SIMD, but SSE2 is not supported."; break;
    case GF_E_LOGBADW: s = "With Log Tables, w must be <= 27."; break;
    case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SIMD|NOSIMD."; break;
    case GF_E_LOGPOLY: s = "Cannot use Log tables because the polynomial is not primitive."; break;
    case GF_E_ZERBADW: s = "With -m LOG_ZERO, w must be 8 or 16."; break;
    case GF_E_ZEXBADW: s = "With -m LOG_ZERO_EXT, w must be 8."; break;
    case GF_E_GR_ARGX: s = "With -m GROUP, arg1 and arg2 must be > 0."; break;
    case GF_E_GR_W_48: s = "With -m GROUP, w cannot be 4 or 8."; break;
    case GF_E_GR_W_16: s = "With -m GROUP, w == 16, arg1 and arg2 must be 4."; break;
    case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break;
    case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break;
    case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break;
    case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SIMD|NOSIMD."; break;
    case GF_E_TABLE_W: s = "With -m TABLE, w must be < 15, or == 16."; break;
    case GF_E_TAB_SSE: s = "With -m TABLE, SIMD|NOSIMD only applies to w=4."; break;
    case GF_E_TABSSE3: s = "With -m TABLE, -r SIMD, you need SSSE3 supported."; break;
    case GF_E_TAB_ALT: s = "With -m TABLE, you cannot use ALTMAP."; break;
    case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break;
    case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SIMD requires -r ALTMAP."; break;
    case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break;
    case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break;
    case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r SIMD|NOSIMD only with arg1/arg2 = 4/128."; break;
    case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break;
    case GF_E_SP_16AR: s = "With -m SPLIT, w=16, Bad arg1/arg2."; break;
    case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break;
    case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SIMD|NOSIMD only with arg1/arg2 = 4/16."; break;
    case GF_E_SP_32AR: s = "With -m SPLIT, w=32, Bad arg1/arg2."; break;
    case GF_E_SP_32AS: s = "With -m SPLIT, w=32, -r ALTMAP needs SSSE3 supported."; break;
    case GF_E_SP_32_A: s = "With -m SPLIT, w=32, -r ALTMAP only with arg1/arg2 = 4/32."; break;
    case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SIMD|NOSIMD only with arg1/arg2 = 4/32."; break;
    case GF_E_SP_64AR: s = "With -m SPLIT, w=64, Bad arg1/arg2."; break;
    case GF_E_SP_64AS: s = "With -m SPLIT, w=64, -r ALTMAP needs SSSE3 supported."; break;
    case GF_E_SP_64_A: s = "With -m SPLIT, w=64, -r ALTMAP only with arg1/arg2 = 4/64."; break;
    case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SIMD|NOSIMD only with arg1/arg2 = 4/64."; break;
    case GF_E_SP_8_AR: s = "With -m SPLIT, w=8, Bad arg1/arg2."; break;
    case GF_E_SP_8__A: s = "With -m SPLIT, w=8, Can't have -r ALTMAP."; break;
    case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SIMD."; break;
    case GF_E_COMP_A2: s = "With -m COMPOSITE, arg1 must equal 2."; break;
    case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SIMD and -r NOSIMD do not apply."; break;
    case GF_E_COMP__W: s = "With -m COMPOSITE, w must be 8, 16, 32, 64 or 128."; break;
    case GF_E_UNKFLAG: s = "Unknown method flag - should be -m, -d, -r or -p."; break;
    case GF_E_UNKNOWN: s = "Unknown multiplication type."; break;
    case GF_E_UNK_REG: s = "Unknown region type."; break;
    case GF_E_UNK_DIV: s = "Unknown division type."; break;
    default: s = "Undefined error.";
  }

  fprintf(stderr, "%s\n", s);
}
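
/* Editor's note: a minimal usage sketch, not part of the original source and
   kept out of compilation.  It shows the intended error-reporting pattern:
   when an init call fails, _gf_errno has been set and gf_error() prints the
   matching message to stderr. */
#if 0
#include "gf_complete.h"

void example_report_init_failure(void)
{
  gf_t gf;

  /* Deliberately invalid: -r CAUCHY is not allowed with w > 32. */
  if (gf_init_hard(&gf, 64, GF_MULT_SHIFT, GF_REGION_CAUCHY,
                   GF_DIVIDE_DEFAULT, 0, 0, 0, NULL, NULL) == 0) {
    gf_error();   /* prints "Cannot specify -r CAUCHY with w > 32." */
  }
}
#endif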

uint64_t gf_composite_get_default_poly(gf_t *base)
{
  gf_internal_t *h;
  uint64_t rv;

  h = (gf_internal_t *) base->scratch;
  if (h->w == 4) {
    if (h->mult_type == GF_MULT_COMPOSITE) return 0;
    if (h->prim_poly == 0x13) return 2;
    return 0;
  }
  if (h->w == 8) {
    if (h->mult_type == GF_MULT_COMPOSITE) return 0;
    if (h->prim_poly == 0x11d) return 3;
    return 0;
  }
  if (h->w == 16) {
    if (h->mult_type == GF_MULT_COMPOSITE) {
      rv = gf_composite_get_default_poly(h->base_gf);
      if (rv != h->prim_poly) return 0;
      if (rv == 3) return 0x105;
      return 0;
    } else {
      if (h->prim_poly == 0x1100b) return 2;
      if (h->prim_poly == 0x1002d) return 7;
      return 0;
    }
  }
  if (h->w == 32) {
    if (h->mult_type == GF_MULT_COMPOSITE) {
      rv = gf_composite_get_default_poly(h->base_gf);
      if (rv != h->prim_poly) return 0;
      if (rv == 2) return 0x10005;
      if (rv == 7) return 0x10008;
      if (rv == 0x105) return 0x10002;
      return 0;
    } else {
      if (h->prim_poly == 0x400007) return 2;
      if (h->prim_poly == 0xc5) return 3;
      return 0;
    }
  }
  if (h->w == 64) {
    if (h->mult_type == GF_MULT_COMPOSITE) {
      rv = gf_composite_get_default_poly(h->base_gf);
      if (rv != h->prim_poly) return 0;
      if (rv == 3) return 0x100000009ULL;
      if (rv == 2) return 0x100000004ULL;
      if (rv == 0x10005) return 0x100000003ULL;
      if (rv == 0x10002) return 0x100000005ULL;
      if (rv == 0x10008) return 0x100000006ULL; /* JSP: (0x100000003 works too,
                                                   but I want to differentiate cases). */
      return 0;
    } else {
      if (h->prim_poly == 0x1bULL) return 2;
      return 0;
    }
  }
  return 0;
}
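
/* Editor's note: a hedged usage sketch, not part of the original source and
   kept out of compilation.  It shows the intent of the lookup above: given an
   already-initialized base field, return the library's default defining
   polynomial for the composite field built over it (0 means "no default").
   Assumes the internal declarations from gf_int.h and the stock w=8 default
   polynomial 0x11d, for which the lookup should return 3. */
#if 0
#include "gf_int.h"

void example_default_composite_poly(void)
{
  gf_t base;
  uint64_t poly;

  gf_init_easy(&base, 8);                      /* base field GF(2^8) */
  poly = gf_composite_get_default_poly(&base); /* 3 for prim_poly 0x11d */
  gf_free(&base, 0);
  (void) poly;
}
#endif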

int gf_error_check(int w, int mult_type, int region_type, int divide_type,
                   int arg1, int arg2, uint64_t poly, gf_t *base)
{
  int sse3 = 0;
  int sse2 = 0;
  int pclmul = 0;
  int rdouble, rquad, rlazy, rsimd, rnosimd, raltmap, rcauchy, tmp;
  gf_internal_t *sub;

  rdouble = (region_type & GF_REGION_DOUBLE_TABLE);
  rquad   = (region_type & GF_REGION_QUAD_TABLE);
  rlazy   = (region_type & GF_REGION_LAZY);
  rsimd   = (region_type & GF_REGION_SIMD);
  rnosimd = (region_type & GF_REGION_NOSIMD);
  raltmap = (region_type & GF_REGION_ALTMAP);
  rcauchy = (region_type & GF_REGION_CAUCHY);

  if (divide_type != GF_DIVIDE_DEFAULT &&
      divide_type != GF_DIVIDE_MATRIX &&
      divide_type != GF_DIVIDE_EUCLID) {
    _gf_errno = GF_E_UNK_DIV;
    return 0;
  }

  tmp = ( GF_REGION_DOUBLE_TABLE | GF_REGION_QUAD_TABLE | GF_REGION_LAZY |
          GF_REGION_SIMD | GF_REGION_NOSIMD | GF_REGION_ALTMAP |
          GF_REGION_CAUCHY );
  if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; }

#ifdef INTEL_SSE2
  if (gf_cpu_supports_intel_sse2) {
    sse2 = 1;
  }
#endif

#ifdef INTEL_SSSE3
  if (gf_cpu_supports_intel_ssse3) {
    sse3 = 1;
  }
#endif

#ifdef INTEL_SSE4_PCLMUL
  if (gf_cpu_supports_intel_pclmul) {
    pclmul = 1;
  }
#endif

#ifdef ARM_NEON
  if (gf_cpu_supports_arm_neon) {
    pclmul = (w == 4 || w == 8);
    sse3 = 1;
  }
#endif

  if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; }

  if (mult_type != GF_MULT_COMPOSITE && w < 64) {
    if ((poly >> (w+1)) != 0) { _gf_errno = GF_E_BADPOLY; return 0; }
  }

  if (mult_type == GF_MULT_DEFAULT) {
    if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_MDEFDIV; return 0; }
    if (region_type != GF_REGION_DEFAULT) { _gf_errno = GF_E_MDEFREG; return 0; }
    if (arg1 != 0 || arg2 != 0) { _gf_errno = GF_E_MDEFARG; return 0; }
    return 1;
  }

  if (rsimd && rnosimd) { _gf_errno = GF_E_SIMD_NO; return 0; }
  if (rcauchy && w > 32) { _gf_errno = GF_E_CAUGT32; return 0; }
  if (rcauchy && region_type != GF_REGION_CAUCHY) { _gf_errno = GF_E_CAUCHYB; return 0; }
  if (rcauchy && mult_type == GF_MULT_COMPOSITE) { _gf_errno = GF_E_CAUCOMP; return 0; }

  if (arg1 != 0 && mult_type != GF_MULT_COMPOSITE &&
      mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) {
    _gf_errno = GF_E_ARG1SET;
    return 0;
  }

  if (arg2 != 0 && mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) {
    _gf_errno = GF_E_ARG2SET;
    return 0;
  }

  if (divide_type == GF_DIVIDE_MATRIX && w > 32) { _gf_errno = GF_E_MATRIXW; return 0; }

  if (rdouble) {
    if (rquad) { _gf_errno = GF_E_DOUQUAD; return 0; }
    if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_DOUBLET; return 0; }
    if (w != 4 && w != 8) { _gf_errno = GF_E_DOUBLEW; return 0; }
    if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; }
    if (rlazy && w == 4) { _gf_errno = GF_E_DOUBLEL; return 0; }
    return 1;
  }

  if (rquad) {
    if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_QUAD__T; return 0; }
    if (w != 4) { _gf_errno = GF_E_QUAD__W; return 0; }
    if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; }
    return 1;
  }

  if (rlazy) { _gf_errno = GF_E_LAZY__X; return 0; }

  if (mult_type == GF_MULT_SHIFT) {
    if (raltmap) { _gf_errno = GF_E_ALTSHIF; return 0; }
    if (rsimd || rnosimd) { _gf_errno = GF_E_SSESHIF; return 0; }
    return 1;
  }

  if (mult_type == GF_MULT_CARRY_FREE) {
    if (w != 4 && w != 8 && w != 16 &&
        w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; }
    if (w == 4 && (poly & 0xc)) { _gf_errno = GF_E_CFM4POL; return 0; }
    if (w == 8 && (poly & 0x80)) { _gf_errno = GF_E_CFM8POL; return 0; }
    if (w == 16 && (poly & 0xe000)) { _gf_errno = GF_E_CF16POL; return 0; }
    if (w == 32 && (poly & 0xfe000000)) { _gf_errno = GF_E_CF32POL; return 0; }
    if (w == 64 && (poly & 0xfffe000000000000ULL)) { _gf_errno = GF_E_CF64POL; return 0; }
    if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; }
    if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; }
    if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; }
    return 1;
  }

  if (mult_type == GF_MULT_CARRY_FREE_GK) {
    if (w != 4 && w != 8 && w != 16 &&
        w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; }
    if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; }
    if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; }
    if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; }
    return 1;
  }

  if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) {
    if (raltmap) { _gf_errno = GF_E_ALT_BY2; return 0; }
    if (rsimd && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; }
    return 1;
  }

  if (mult_type == GF_MULT_LOG_TABLE || mult_type == GF_MULT_LOG_ZERO
                                     || mult_type == GF_MULT_LOG_ZERO_EXT) {
    if (w > 27) { _gf_errno = GF_E_LOGBADW; return 0; }
    if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_LOG___J; return 0; }

    if (mult_type == GF_MULT_LOG_TABLE) return 1;

    if (w != 8 && w != 16) { _gf_errno = GF_E_ZERBADW; return 0; }

    if (mult_type == GF_MULT_LOG_ZERO) return 1;

    if (w != 8) { _gf_errno = GF_E_ZEXBADW; return 0; }
    return 1;
  }

  if (mult_type == GF_MULT_GROUP) {
    if (arg1 <= 0 || arg2 <= 0) { _gf_errno = GF_E_GR_ARGX; return 0; }
    if (w == 4 || w == 8) { _gf_errno = GF_E_GR_W_48; return 0; }
    if (w == 16 && (arg1 != 4 || arg2 != 4)) { _gf_errno = GF_E_GR_W_16; return 0; }
    if (w == 128 && (arg1 != 4 ||
       (arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; }
    if (arg1 > 27 || arg2 > 27) { _gf_errno = GF_E_GR_A_27; return 0; }
    if (arg1 > w || arg2 > w) { _gf_errno = GF_E_GR_AR_W; return 0; }
    if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_GR____J; return 0; }
    return 1;
  }

  if (mult_type == GF_MULT_TABLE) {
    if (w != 16 && w >= 15) { _gf_errno = GF_E_TABLE_W; return 0; }
    if (w != 4 && (rsimd || rnosimd)) { _gf_errno = GF_E_TAB_SSE; return 0; }
    if (rsimd && !sse3) { _gf_errno = GF_E_TABSSE3; return 0; }
    if (raltmap) { _gf_errno = GF_E_TAB_ALT; return 0; }
    return 1;
  }

  if (mult_type == GF_MULT_SPLIT_TABLE) {
    if (arg1 > arg2) {
      tmp = arg1;
      arg1 = arg2;
      arg2 = tmp;
    }
    if (w == 8) {
      if (arg1 != 4 || arg2 != 8) { _gf_errno = GF_E_SP_8_AR; return 0; }
      if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
      if (raltmap) { _gf_errno = GF_E_SP_8__A; return 0; }
    } else if (w == 16) {
      if ((arg1 == 8 && arg2 == 8) ||
          (arg1 == 8 && arg2 == 16)) {
        if (rsimd || rnosimd) { _gf_errno = GF_E_SP_16_S; return 0; }
        if (raltmap) { _gf_errno = GF_E_SP_16_A; return 0; }
      } else if (arg1 == 4 && arg2 == 16) {
        if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
      } else { _gf_errno = GF_E_SP_16AR; return 0; }
    } else if (w == 32) {
      if ((arg1 == 8 && arg2 == 8) ||
          (arg1 == 8 && arg2 == 32) ||
          (arg1 == 16 && arg2 == 32)) {
        if (rsimd || rnosimd) { _gf_errno = GF_E_SP_32_S; return 0; }
        if (raltmap) { _gf_errno = GF_E_SP_32_A; return 0; }
      } else if (arg1 == 4 && arg2 == 32) {
        if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
        if (raltmap && !sse3) { _gf_errno = GF_E_SP_32AS; return 0; }
        if (raltmap && rnosimd) { _gf_errno = GF_E_SP_32AS; return 0; }
      } else { _gf_errno = GF_E_SP_32AR; return 0; }
    } else if (w == 64) {
      if ((arg1 == 8 && arg2 == 8) ||
          (arg1 == 8 && arg2 == 64) ||
          (arg1 == 16 && arg2 == 64)) {
        if (rsimd || rnosimd) { _gf_errno = GF_E_SP_64_S; return 0; }
        if (raltmap) { _gf_errno = GF_E_SP_64_A; return 0; }
      } else if (arg1 == 4 && arg2 == 64) {
        if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
        if (raltmap && !sse3) { _gf_errno = GF_E_SP_64AS; return 0; }
        if (raltmap && rnosimd) { _gf_errno = GF_E_SP_64AS; return 0; }
      } else { _gf_errno = GF_E_SP_64AR; return 0; }
    } else if (w == 128) {
      if (arg1 == 8 && arg2 == 128) {
        if (rsimd || rnosimd) { _gf_errno = GF_E_SP128_S; return 0; }
        if (raltmap) { _gf_errno = GF_E_SP128_A; return 0; }
      } else if (arg1 == 4 && arg2 == 128) {
        if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
        if (raltmap && !sse3) { _gf_errno = GF_E_SP128AS; return 0; }
        if (raltmap && rnosimd) { _gf_errno = GF_E_SP128AS; return 0; }
      } else { _gf_errno = GF_E_SP128AR; return 0; }
    } else { _gf_errno = GF_E_SPLIT_W; return 0; }
    return 1;
  }

  if (mult_type == GF_MULT_COMPOSITE) {
    if (w != 8 && w != 16 && w != 32
        && w != 64 && w != 128) { _gf_errno = GF_E_COMP__W; return 0; }
    if (w < 128 && (poly >> (w/2)) != 0) { _gf_errno = GF_E_COMP_PP; return 0; }
    if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_DIVCOMP; return 0; }
    if (arg1 != 2) { _gf_errno = GF_E_COMP_A2; return 0; }
    if (rsimd || rnosimd) { _gf_errno = GF_E_COMP_SS; return 0; }
    if (base != NULL) {
      sub = (gf_internal_t *) base->scratch;
      if (sub->w != w/2) { _gf_errno = GF_E_BASE__W; return 0; }
      if (poly == 0) {
        if (gf_composite_get_default_poly(base) == 0) { _gf_errno = GF_E_COMPXPP; return 0; }
      }
    }
    return 1;
  }

  _gf_errno = GF_E_UNKNOWN;
  return 0;
}
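
/* Editor's note: a minimal sketch, not in the original source and kept out of
   compilation.  gf_error_check() can be used as a pre-flight validator before
   anything is allocated; on failure it sets _gf_errno, which gf_error()
   translates to text.  The argument combination below is deliberately
   invalid. */
#if 0
#include "gf_int.h"

void example_validate_args(void)
{
  /* -m GROUP requires positive arg1/arg2, so this sets GF_E_GR_ARGX. */
  if (!gf_error_check(32, GF_MULT_GROUP, GF_REGION_DEFAULT,
                      GF_DIVIDE_DEFAULT, 0, 0, 0, NULL)) {
    gf_error();   /* "With -m GROUP, arg1 and arg2 must be > 0." */
  }
}
#endif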

int gf_scratch_size(int w,
                    int mult_type,
                    int region_type,
                    int divide_type,
                    int arg1,
                    int arg2)
{
  if (gf_error_check(w, mult_type, region_type, divide_type, arg1, arg2, 0, NULL) == 0) return 0;

  switch(w) {
    case 4: return gf_w4_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 8: return gf_w8_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 16: return gf_w16_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 32: return gf_w32_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 64: return gf_w64_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 128: return gf_w128_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    default: return gf_wgen_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
  }
}
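
/* Editor's note: a hedged sketch, not in the original source and kept out of
   compilation.  It shows the intended pairing of gf_scratch_size() with
   gf_init_hard()'s scratch_memory argument: size the buffer first, then hand
   it to the initializer, so the library allocates nothing itself. */
#if 0
#include "gf_complete.h"
#include <stdlib.h>

int example_init_with_own_scratch(gf_t *gf, void **scratch_out)
{
  int sz;
  void *scratch;

  sz = gf_scratch_size(16, GF_MULT_LOG_TABLE, GF_REGION_DEFAULT,
                       GF_DIVIDE_DEFAULT, 0, 0);
  if (sz == 0) { gf_error(); return 0; }

  scratch = malloc(sz);
  if (scratch == NULL) return 0;

  if (!gf_init_hard(gf, 16, GF_MULT_LOG_TABLE, GF_REGION_DEFAULT,
                    GF_DIVIDE_DEFAULT, 0, 0, 0, NULL, scratch)) {
    gf_error();
    free(scratch);
    return 0;
  }
  *scratch_out = scratch;   /* caller frees after gf_free(gf, 0) */
  return 1;
}
#endif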

extern int gf_size(gf_t *gf)
{
  gf_internal_t *h;
  int s;

  s = sizeof(gf_t);
  h = (gf_internal_t *) gf->scratch;
  s += gf_scratch_size(h->w, h->mult_type, h->region_type, h->divide_type, h->arg1, h->arg2);
  if (h->mult_type == GF_MULT_COMPOSITE) s += gf_size(h->base_gf);
  return s;
}

int gf_init_easy(gf_t *gf, int w)
{
  return gf_init_hard(gf, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
                      0, 0, 0, NULL, NULL);
}
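
/* Editor's note: a minimal sketch, not in the original source and kept out of
   compilation.  This is the easy path: default method selection for the word
   size, then single-word arithmetic through the function pointers that the
   word-size init routine installed. */
#if 0
#include "gf_complete.h"
#include <stdio.h>

void example_easy_arithmetic(void)
{
  gf_t gf;
  uint32_t c;

  if (!gf_init_easy(&gf, 16)) { gf_error(); return; }
  c = gf.multiply.w32(&gf, 0x1234, 0x5678);                    /* product in GF(2^16) */
  printf("0x1234 * 0x5678 = 0x%x\n", c);
  printf("back again: 0x%x\n", gf.divide.w32(&gf, c, 0x5678)); /* 0x1234 */
  gf_free(&gf, 0);
}
#endif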

/* Allen: What's going on here is this function is putting info into the
   scratch mem of gf, and then calling the relevant REAL init
   func for the word size.  Probably done this way to consolidate
   those aspects of initialization that don't rely on word size,
   and then take care of word-size-specific stuff. */

int gf_init_hard(gf_t *gf, int w, int mult_type,
                 int region_type,
                 int divide_type,
                 uint64_t prim_poly,
                 int arg1, int arg2,
                 gf_t *base_gf,
                 void *scratch_memory)
{
  int sz;
  gf_internal_t *h;

  gf_cpu_identify();

  if (gf_error_check(w, mult_type, region_type, divide_type,
                     arg1, arg2, prim_poly, base_gf) == 0) return 0;

  sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
  if (sz <= 0) return 0;  /* This shouldn't happen, as all errors should get caught
                             in gf_error_check() */

  if (scratch_memory == NULL) {
    h = (gf_internal_t *) malloc(sz);
    if (h == NULL) return 0;   /* allocation failure */
    h->free_me = 1;
  } else {
    h = scratch_memory;
    h->free_me = 0;
  }
  gf->scratch = (void *) h;
  h->mult_type = mult_type;
  h->region_type = region_type;
  h->divide_type = divide_type;
  h->w = w;
  h->prim_poly = prim_poly;
  h->arg1 = arg1;
  h->arg2 = arg2;
  h->base_gf = base_gf;
  h->private = (void *) gf->scratch;
  h->private = (uint8_t *)h->private + (sizeof(gf_internal_t));
  gf->extract_word.w32 = NULL;

  switch(w) {
    case 4: return gf_w4_init(gf);
    case 8: return gf_w8_init(gf);
    case 16: return gf_w16_init(gf);
    case 32: return gf_w32_init(gf);
    case 64: return gf_w64_init(gf);
    case 128: return gf_w128_init(gf);
    default: return gf_wgen_init(gf);
  }
}
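
/* Editor's note: a hedged sketch, not in the original source and kept out of
   compilation.  The hard path with a base field: build GF((2^16)^2) as a
   composite GF(2^32).  prim_poly = 0 defers to
   gf_composite_get_default_poly(); gf_free(&gf, 1) then tears the base field
   down recursively, matching the free(h->base_gf) in gf_free() below. */
#if 0
#include "gf_complete.h"
#include <stdlib.h>

int example_composite_field(void)
{
  gf_t *base = (gf_t *) malloc(sizeof(gf_t));
  gf_t gf;

  if (base == NULL || !gf_init_easy(base, 16)) return 0;

  if (!gf_init_hard(&gf, 32, GF_MULT_COMPOSITE, GF_REGION_DEFAULT,
                    GF_DIVIDE_DEFAULT, 0, 2, 0, base, NULL)) {
    gf_error();
    return 0;
  }
  /* ... use gf.multiply.w32() etc. ... */
  gf_free(&gf, 1);   /* recursive: frees base's scratch, then base itself */
  return 1;
}
#endif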

int gf_free(gf_t *gf, int recursive)
{
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
  if (recursive && h->base_gf != NULL) {
    gf_free(h->base_gf, 1);
    free(h->base_gf);
  }
  if (h->free_me) free(h);
  return 0; /* Making compiler happy */
}

void gf_alignment_error(char *s, int a)
{
  fprintf(stderr, "Alignment error in %s:\n", s);
  fprintf(stderr, "   The source and destination buffers must be aligned to each other,\n");
  fprintf(stderr, "   and they must be aligned to a %d-byte address.\n", a);
  assert(0);
}

static
void gf_invert_binary_matrix(uint32_t *mat, uint32_t *inv, int rows) {
  int cols, i, j;
  uint32_t tmp;

  cols = rows;

  for (i = 0; i < rows; i++) inv[i] = (1 << i);

  /* First -- convert into upper triangular */

  for (i = 0; i < cols; i++) {
    /* Swap rows if we have a zero i,i element.  If we can't swap, then the
       matrix was not invertible */

    if ((mat[i] & (1 << i)) == 0) {
      for (j = i+1; j < rows && (mat[j] & (1 << i)) == 0; j++) ;
      if (j == rows) {
        fprintf(stderr, "galois_invert_matrix: Matrix not invertible!!\n");
        assert(0);
      }
      tmp = mat[i]; mat[i] = mat[j]; mat[j] = tmp;
      tmp = inv[i]; inv[i] = inv[j]; inv[j] = tmp;
    }

    /* Now for each j>i, add A_ji*Ai to Aj */
    for (j = i+1; j != rows; j++) {
      if ((mat[j] & (1 << i)) != 0) {
        mat[j] ^= mat[i];
        inv[j] ^= inv[i];
      }
    }
  }
  /* Now the matrix is upper triangular.  Back-substitute from the bottom
     row up, clearing the entries above each diagonal element. */

  for (i = rows-1; i >= 0; i--) {
    for (j = 0; j < i; j++) {
      if (mat[j] & (1 << i)) {
        /* mat[j] ^= mat[i]; */
        inv[j] ^= inv[i];
      }
    }
  }
}

uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp)
{
  uint32_t mat[32], inv[32], mask;
  int i;

  mask = (w == 32) ? 0xffffffff : ((uint32_t)1 << w) - 1;
  for (i = 0; i < w; i++) {
    mat[i] = y;

    if (y & (1 << (w-1))) {
      y = y << 1;
      y = ((y ^ pp) & mask);
    } else {
      y = y << 1;
    }
  }

  gf_invert_binary_matrix(mat, inv, w);
  return inv[0];
}
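
/* Editor's note: a hedged sketch, not in the original source and kept out of
   compilation.  gf_bitmatrix_inverse() inverts the w x w bit matrix that
   represents multiplication by y, and the row returned corresponds to y^-1.
   The check below verifies y * y^-1 == 1, assuming the stock w=8 polynomial
   0x11d. */
#if 0
#include "gf_int.h"
#include <assert.h>

void example_bitmatrix_inverse(void)
{
  gf_t gf;
  uint32_t y = 0x53, yi;

  gf_init_easy(&gf, 8);
  yi = gf_bitmatrix_inverse(y, 8, 0x11d);
  assert(gf.multiply.w32(&gf, y, yi) == 1);
  gf_free(&gf, 0);
}
#endif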

void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base)
{
  uint64_t a, prod;
  int xor;
  uint64_t *s64, *d64, *top;

  s64 = rd->s_start;
  d64 = rd->d_start;
  top = rd->d_top;
  xor = rd->xor;

  if (xor) {
    while (d64 != top) {
      a = *s64;
      prod = base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      prod ^= *d64;
      *d64 = prod;
      s64++;
      d64++;
    }
  } else {
    while (d64 != top) {
      a = *s64;
      prod = base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      *d64 = prod;
      s64++;
      d64++;
    }
  }
}

static void gf_slow_multiply_region(gf_region_data *rd, void *src, void *dest, void *s_top)
{
  uint8_t *s8, *d8;
  uint16_t *s16, *d16;
  uint32_t *s32, *d32;
  uint64_t *s64, *d64;
  gf_internal_t *h;
  int wb;
  uint32_t p, a;

  h = rd->gf->scratch;
  wb = (h->w)/8;
  if (wb == 0) wb = 1;

  while (src < s_top) {
    switch (h->w) {
      case 8:
        s8 = (uint8_t *) src;
        d8 = (uint8_t *) dest;
        *d8 = (rd->xor) ? (*d8 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s8)) :
                          rd->gf->multiply.w32(rd->gf, rd->val, *s8);
        break;
      case 4:
        s8 = (uint8_t *) src;
        d8 = (uint8_t *) dest;
        a = *s8;
        p = rd->gf->multiply.w32(rd->gf, rd->val, a&0xf);
        p |= (rd->gf->multiply.w32(rd->gf, rd->val, a >> 4) << 4);
        if (rd->xor) p ^= *d8;
        *d8 = p;
        break;
      case 16:
        s16 = (uint16_t *) src;
        d16 = (uint16_t *) dest;
        *d16 = (rd->xor) ? (*d16 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s16)) :
                           rd->gf->multiply.w32(rd->gf, rd->val, *s16);
        break;
      case 32:
        s32 = (uint32_t *) src;
        d32 = (uint32_t *) dest;
        *d32 = (rd->xor) ? (*d32 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s32)) :
                           rd->gf->multiply.w32(rd->gf, rd->val, *s32);
        break;
      case 64:
        s64 = (uint64_t *) src;
        d64 = (uint64_t *) dest;
        *d64 = (rd->xor) ? (*d64 ^ rd->gf->multiply.w64(rd->gf, rd->val, *s64)) :
                           rd->gf->multiply.w64(rd->gf, rd->val, *s64);
        break;
      default:
        fprintf(stderr, "Error: gf_slow_multiply_region: w=%d not implemented.\n", h->w);
        exit(1);
    }
    src = (uint8_t *)src + wb;
    dest = (uint8_t *)dest + wb;
  }
}

/* JSP - The purpose of this procedure is to error check alignment,
   and to set up the region operation so that it can best leverage
   large words.

   It stores its information in rd.

   Assuming you're not doing Cauchy coding, (see below for that),
   then w will be 4, 8, 16, 32 or 64.  It can't be 128 (probably
   should change that).

   src and dest must then be aligned on ceil(w/8)-byte boundaries.
   Moreover, bytes must be a multiple of ceil(w/8).  If the variable
   align is equal to ceil(w/8), then we will set s_start = src,
   d_start = dest, s_top to (src+bytes) and d_top to (dest+bytes).
   And we return -- the implementation will go ahead and do the
   multiplication on individual words (e.g. using discrete logs).

   If align is greater than ceil(w/8), then the implementation needs
   to work on groups of "align" bytes.  For example, suppose you are
   implementing BYTWO, without SSE.  Then you will be doing the region
   multiplication in units of 8 bytes, so align = 8.  Or, suppose you
   are doing a Quad table in GF(2^4).  You will be doing the region
   multiplication in units of 2 bytes, so align = 2.  Or, suppose you
   are doing split multiplication with SSE operations in GF(2^8).
   Then align = 16.  Worse yet, suppose you are doing split
   multiplication with SSE operations in GF(2^16), with or without
   ALTMAP.  Then, you will be doing the multiplication on 256 bits at
   a time.  So align = 32.

   When align does not equal ceil(w/8), we split the region
   multiplication into three parts.  We are going to make s_start be
   the first address greater than or equal to src that is a multiple
   of align.  s_top is going to be the largest address <= src+bytes
   such that (s_top - s_start) is a multiple of align.  We do the
   same with d_start and d_top.  When we say that "src and dest must
   be aligned with respect to each other," we mean that s_start-src
   must equal d_start-dest.

   Now, the region multiplication is done in three parts -- the part
   between src and s_start must be done using single words.
   Similarly, the part between s_top and src+bytes must also be done
   using single words.  The part between s_start and s_top will be
   done in chunks of "align" bytes.

   One final thing -- if align > 16, then s_start and d_start will be
   aligned on a 16 byte boundary.  Perhaps we should have two
   variables: align and chunksize.  Then we'd have s_start & d_start
   aligned to "align", and have s_top-s_start be a multiple of
   chunksize.  That may be less confusing, but it would be a big
   change.

   Finally, if align = -1, then we are doing Cauchy multiplication,
   using only XOR's.  In this case, we're not going to care about
   alignment because we are just doing XOR's.  Instead, the only
   thing we care about is that bytes must be a multiple of w.

   This is not to say that alignment doesn't matter in performance
   with XOR's.  See that discussion in gf_multby_one().

   After you call gf_set_region_data(), the procedure
   gf_do_initial_region_alignment() calls gf->multiply.w32() on
   everything between src and s_start.  The procedure
   gf_do_final_region_alignment() calls gf->multiply.w32() on
   everything between s_top and src+bytes.
 */

void gf_set_region_data(gf_region_data *rd,
                        gf_t *gf,
                        void *src,
                        void *dest,
                        int bytes,
                        uint64_t val,
                        int xor,
                        int align)
{
  gf_internal_t *h = NULL;
  int wb;
  uint32_t a;
  unsigned long uls, uld;

  if (gf == NULL) {  /* JSP - Can be NULL if you're just doing XOR's */
    wb = 1;
  } else {
    h = gf->scratch;
    wb = (h->w)/8;
    if (wb == 0) wb = 1;
  }

  rd->gf = gf;
  rd->src = src;
  rd->dest = dest;
  rd->bytes = bytes;
  rd->val = val;
  rd->xor = xor;
  rd->align = align;

  uls = (unsigned long) src;
  uld = (unsigned long) dest;

  a = (align <= 16) ? align : 16;

  if (align == -1) { /* JSP: This is cauchy.  Error check bytes, then set up the pointers
                        so that there are no alignment regions. */
    if (h != NULL && bytes % h->w != 0) {
      fprintf(stderr, "Error in region multiply operation.\n");
      fprintf(stderr, "The size must be a multiple of %d bytes.\n", h->w);
      assert(0);
    }

    rd->s_start = src;
    rd->d_start = dest;
    rd->s_top = (uint8_t *)src + bytes;
    rd->d_top = (uint8_t *)dest + bytes;
    return;
  }

  if (uls % a != uld % a) {
    fprintf(stderr, "Error in region multiply operation.\n");
    fprintf(stderr, "The source & destination pointers must be aligned with respect\n");
    fprintf(stderr, "to each other along a %d byte boundary.\n", a);
    fprintf(stderr, "Src = 0x%lx.  Dest = 0x%lx\n", (unsigned long) src,
            (unsigned long) dest);
    assert(0);
  }

  if (uls % wb != 0) {
    fprintf(stderr, "Error in region multiply operation.\n");
    fprintf(stderr, "The pointers must be aligned along a %d byte boundary.\n", wb);
    fprintf(stderr, "Src = 0x%lx.  Dest = 0x%lx\n", (unsigned long) src,
            (unsigned long) dest);
    assert(0);
  }

  if (bytes % wb != 0) {
    fprintf(stderr, "Error in region multiply operation.\n");
    fprintf(stderr, "The size must be a multiple of %d bytes.\n", wb);
    assert(0);
  }

  uls %= a;
  if (uls != 0) uls = (a-uls);
  rd->s_start = (uint8_t *)rd->src + uls;
  rd->d_start = (uint8_t *)rd->dest + uls;
  bytes -= uls;
  bytes -= (bytes % align);
  rd->s_top = (uint8_t *)rd->s_start + bytes;
  rd->d_top = (uint8_t *)rd->d_start + bytes;
}
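
/* Editor's note: a hedged sketch, not in the original source and kept out of
   compilation.  It shows the three-part pattern described in the comment
   above, as a region implementation would use it: the unaligned head and tail
   go through gf_do_initial/final_region_alignment(), and the aligned middle
   [s_start, s_top) is processed in "align"-byte chunks.  For concreteness the
   chunk kernel here is multiply-by-one with xor = 1, i.e. a plain XOR. */
#if 0
#include "gf_int.h"

static void example_region_xor_accumulate(gf_t *gf, void *src, void *dest, int bytes)
{
  gf_region_data rd;
  uint64_t *s64, *d64;

  /* val = 1, xor = 1, align = 8: dest ^= src over 8-byte chunks. */
  gf_set_region_data(&rd, gf, src, dest, bytes, 1, 1, 8);
  gf_do_initial_region_alignment(&rd);    /* head, word by word */

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  while (d64 < (uint64_t *) rd.d_top) {   /* aligned middle */
    *d64 ^= *s64;                         /* stand-in fast kernel */
    d64++;
    s64++;
  }

  gf_do_final_region_alignment(&rd);      /* tail, word by word */
}
#endif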

void gf_do_initial_region_alignment(gf_region_data *rd)
{
  gf_slow_multiply_region(rd, rd->src, rd->dest, rd->s_start);
}

void gf_do_final_region_alignment(gf_region_data *rd)
{
  gf_slow_multiply_region(rd, rd->s_top, rd->d_top, (uint8_t *)rd->src+rd->bytes);
}

void gf_multby_zero(void *dest, int bytes, int xor)
{
  if (xor) return;
  bzero(dest, bytes);
  return;
}

/* JSP - gf_multby_one tries to do this in the most efficient way
   possible.  If xor = 0, then simply call memcpy() since that
   should be optimized by the system.  Otherwise, try to do the xor
   in the following order:

   If src and dest are aligned with respect to each other on 16-byte
   boundaries and you have SSE instructions, then use aligned SSE
   instructions.

   If they aren't but you still have SSE instructions, use unaligned
   SSE instructions.

   If there are no SSE instructions, but they are aligned with
   respect to each other on 8-byte boundaries, then do them with
   uint64_t's.

   Otherwise, call gf_unaligned_xor(), which does the following:
   align a destination pointer along an 8-byte boundary, and then
   memcpy 32 bytes at a time from the src pointer to an array of
   doubles.  I'm not sure if that's the best -- probably needs
   testing, but this seems like it could be a black hole.
 */

static void gf_unaligned_xor(void *src, void *dest, int bytes);

void gf_multby_one(void *src, void *dest, int bytes, int xor)
{
  unsigned long uls, uld;
  uint8_t *s8, *d8;
  uint64_t *s64, *d64, *dtop64;
  gf_region_data rd;

  if (!xor) {
    if (dest != src)
      memcpy(dest, src, bytes);
    return;
  }
  uls = (unsigned long) src;
  uld = (unsigned long) dest;

#ifdef INTEL_SSE2
  if (gf_cpu_supports_intel_sse2) {
    __m128i ms, md;
    int abytes;
    s8 = (uint8_t *) src;
    d8 = (uint8_t *) dest;
    if (uls % 16 == uld % 16) {
      gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
      while (s8 != rd.s_start) {
        *d8 ^= *s8;
        d8++;
        s8++;
      }
      while (s8 < (uint8_t *) rd.s_top) {
        ms = _mm_load_si128 ((__m128i *)(s8));
        md = _mm_load_si128 ((__m128i *)(d8));
        md = _mm_xor_si128(md, ms);
        _mm_store_si128((__m128i *)(d8), md);
        s8 += 16;
        d8 += 16;
      }
      while (s8 != (uint8_t *) src + bytes) {
        *d8 ^= *s8;
        d8++;
        s8++;
      }
      return;
    }

    abytes = (bytes & 0xfffffff0);

    while (d8 < (uint8_t *) dest + abytes) {
      ms = _mm_loadu_si128 ((__m128i *)(s8));
      md = _mm_loadu_si128 ((__m128i *)(d8));
      md = _mm_xor_si128(md, ms);
      _mm_storeu_si128((__m128i *)(d8), md);
      s8 += 16;
      d8 += 16;
    }
    while (d8 != (uint8_t *) dest+bytes) {
      *d8 ^= *s8;
      d8++;
      s8++;
    }
    return;
  }
#endif
#if defined(ARM_NEON)
  if (gf_cpu_supports_arm_neon) {
    s8 = (uint8_t *) src;
    d8 = (uint8_t *) dest;

    if (uls % 16 == uld % 16) {
      gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
      while (s8 != rd.s_start) {
        *d8 ^= *s8;
        s8++;
        d8++;
      }
      while (s8 < (uint8_t *) rd.s_top) {
        uint8x16_t vs = vld1q_u8 (s8);
        uint8x16_t vd = vld1q_u8 (d8);
        uint8x16_t vr = veorq_u8 (vs, vd);
        vst1q_u8 (d8, vr);
        s8 += 16;
        d8 += 16;
      }
    } else {
      while (s8 + 15 < (uint8_t *) src + bytes) {
        uint8x16_t vs = vld1q_u8 (s8);
        uint8x16_t vd = vld1q_u8 (d8);
        uint8x16_t vr = veorq_u8 (vs, vd);
        vst1q_u8 (d8, vr);
        s8 += 16;
        d8 += 16;
      }
    }
    while (s8 < (uint8_t *) src + bytes) {
      *d8 ^= *s8;
      s8++;
      d8++;
    }
    return;
  }
#endif
  if (uls % 8 != uld % 8) {
    gf_unaligned_xor(src, dest, bytes);
    return;
  }

  gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 8);
  s8 = (uint8_t *) src;
  d8 = (uint8_t *) dest;
  while (d8 != rd.d_start) {
    *d8 ^= *s8;
    d8++;
    s8++;
  }
  dtop64 = (uint64_t *) rd.d_top;

  d64 = (uint64_t *) rd.d_start;
  s64 = (uint64_t *) rd.s_start;

  while (d64 < dtop64) {
    *d64 ^= *s64;
    d64++;
    s64++;
  }

  s8 = (uint8_t *) rd.s_top;
  d8 = (uint8_t *) rd.d_top;

  while (d8 != (uint8_t *) dest+bytes) {
    *d8 ^= *s8;
    d8++;
    s8++;
  }
  return;
}
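
/* Editor's note: a minimal sketch, not in the original source and kept out of
   compilation.  With xor = 1, gf_multby_one() is the library's generic
   "dest ^= src" path, e.g. for accumulating parity in an erasure code. */
#if 0
#include "gf_complete.h"

void example_accumulate_parity(void *parity, void **data, int k, int bytes)
{
  int i;

  gf_multby_zero(parity, bytes, 0);            /* clear the accumulator */
  for (i = 0; i < k; i++)
    gf_multby_one(data[i], parity, bytes, 1);  /* parity ^= data[i] */
}
#endif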

#define UNALIGNED_BUFSIZE (8)

static void gf_unaligned_xor(void *src, void *dest, int bytes)
{
  uint64_t scopy[UNALIGNED_BUFSIZE], *d64;
  int i;
  gf_region_data rd;
  uint8_t *s8, *d8;

  /* JSP - call gf_set_region_data(), but use dest in both places.  This is
     because I only want to set up dest.  If I used src, gf_set_region_data()
     would fail because src and dest are not aligned to each other wrt
     8-byte pointers.  I know this will actually align d_start to 16 bytes.
     If I change gf_set_region_data() to split alignment & chunksize, then
     I could do this correctly. */

  gf_set_region_data(&rd, NULL, dest, dest, bytes, 1, 1, 8*UNALIGNED_BUFSIZE);
  s8 = (uint8_t *) src;
  d8 = (uint8_t *) dest;

  while (d8 < (uint8_t *) rd.d_start) {
    *d8 ^= *s8;
    d8++;
    s8++;
  }

  d64 = (uint64_t *) d8;
  while (d64 < (uint64_t *) rd.d_top) {
    memcpy(scopy, s8, 8*UNALIGNED_BUFSIZE);
    s8 += 8*UNALIGNED_BUFSIZE;
    for (i = 0; i < UNALIGNED_BUFSIZE; i++) {
      *d64 ^= scopy[i];
      d64++;
    }
  }

  d8 = (uint8_t *) d64;
  while (d8 < (uint8_t *) ((uint8_t *)dest+bytes)) {
    *d8 ^= *s8;
    d8++;
    s8++;
  }
}