/*
 * GF-Complete: A Comprehensive Open Source Library for Galois Field Arithmetic
 * James S. Plank, Ethan L. Miller, Kevin M. Greenan,
 * Benjamin A. Arnold, John A. Burnum, Adam W. Disney, Allen C. McBride.
 *
 * gf.c
 *
 * Generic routines for Galois fields
 */

#include "gf_int.h"
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include "gf_cpu.h"

int _gf_errno = GF_E_DEFAULT;

void gf_error()
{
  char *s;

  switch(_gf_errno) {
    case GF_E_DEFAULT: s = "No Error."; break;
    case GF_E_TWOMULT: s = "Cannot specify two -m's."; break;
    case GF_E_TWO_DIV: s = "Cannot specify two -d's."; break;
    case GF_E_POLYSPC: s = "-p needs to be followed by a number in hex (0x optional)."; break;
    case GF_E_GROUPAR: s = "Ran out of arguments in -m GROUP."; break;
    case GF_E_GROUPNU: s = "In -m GROUP g_s g_r -- g_s and g_r need to be numbers."; break;
    case GF_E_SPLITAR: s = "Ran out of arguments in -m SPLIT."; break;
    case GF_E_SPLITNU: s = "In -m SPLIT w_a w_b -- w_a and w_b need to be numbers."; break;
    case GF_E_FEWARGS: s = "Not enough arguments (Perhaps end with '-'?)"; break;
    case GF_E_CFM___W: s = "-m CARRY_FREE, w must be 4, 8, 16, 32, 64 or 128."; break;
    case GF_E_COMPXPP: s = "-m COMPOSITE, No poly specified, and we don't have a default for the given sub-field."; break;
    case GF_E_BASE__W: s = "-m COMPOSITE and the base field is not for w/2."; break;
    case GF_E_CFM4POL: s = "-m CARRY_FREE, w=4. (Prim-poly & 0xc) must equal 0."; break;
    case GF_E_CFM8POL: s = "-m CARRY_FREE, w=8. (Prim-poly & 0x80) must equal 0."; break;
    case GF_E_CF16POL: s = "-m CARRY_FREE, w=16. (Prim-poly & 0xe000) must equal 0."; break;
    case GF_E_CF32POL: s = "-m CARRY_FREE, w=32. (Prim-poly & 0xfe000000) must equal 0."; break;
    case GF_E_CF64POL: s = "-m CARRY_FREE, w=64. (Prim-poly & 0xfffe000000000000ULL) must equal 0."; break;
    case GF_E_MDEFDIV: s = "If multiplication method == default, can't change division."; break;
    case GF_E_MDEFREG: s = "If multiplication method == default, can't change region."; break;
    case GF_E_MDEFARG: s = "If multiplication method == default, can't use arg1/arg2."; break;
    case GF_E_DIVCOMP: s = "Cannot change the division technique with -m COMPOSITE."; break;
    case GF_E_DOUQUAD: s = "Cannot specify -r DOUBLE and -r QUAD."; break;
    case GF_E_SIMD_NO: s = "Cannot specify -r SIMD and -r NOSIMD."; break;
    case GF_E_CAUCHYB: s = "Cannot specify -r CAUCHY and any other -r."; break;
    case GF_E_CAUCOMP: s = "Cannot specify -m COMPOSITE and -r CAUCHY."; break;
    case GF_E_CAUGT32: s = "Cannot specify -r CAUCHY with w > 32."; break;
    case GF_E_ARG1SET: s = "Only use arg1 with SPLIT, GROUP or COMPOSITE."; break;
    case GF_E_ARG2SET: s = "Only use arg2 with SPLIT or GROUP."; break;
    case GF_E_MATRIXW: s = "Cannot specify -d MATRIX with w > 32."; break;
    case GF_E_BAD___W: s = "W must be 1-32, 64 or 128."; break;
    case GF_E_DOUBLET: s = "Can only specify -r DOUBLE with -m TABLE."; break;
    case GF_E_DOUBLEW: s = "Can only specify -r DOUBLE w = 4 or w = 8."; break;
    case GF_E_DOUBLEJ: s = "Cannot specify -r DOUBLE with -r ALTMAP|SIMD|NOSIMD."; break;
    case GF_E_DOUBLEL: s = "Can only specify -r DOUBLE -r LAZY with w = 8"; break;
    case GF_E_QUAD__T: s = "Can only specify -r QUAD with -m TABLE."; break;
    case GF_E_QUAD__W: s = "Can only specify -r QUAD w = 4."; break;
    case GF_E_QUAD__J: s = "Cannot specify -r QUAD with -r ALTMAP|SIMD|NOSIMD."; break;
    case GF_E_BADPOLY: s = "Bad primitive polynomial (high bits set)."; break;
    case GF_E_COMP_PP: s = "Bad primitive polynomial -- bigger than sub-field."; break;
    case GF_E_LAZY__X: s = "If -r LAZY, then -r must be DOUBLE or QUAD."; break;
    case GF_E_ALTSHIF: s = "Cannot specify -m SHIFT and -r ALTMAP."; break;
    case GF_E_SSESHIF: s = "Cannot specify -m SHIFT and -r SIMD|NOSIMD."; break;
    case GF_E_ALT_CFM: s = "Cannot specify -m CARRY_FREE and -r ALTMAP."; break;
    case GF_E_SSE_CFM: s = "Cannot specify -m CARRY_FREE and -r SIMD|NOSIMD."; break;
    case GF_E_PCLMULX: s = "Specified -m CARRY_FREE, but PCLMUL is not supported."; break;
    case GF_E_ALT_BY2: s = "Cannot specify -m BYTWO_x and -r ALTMAP."; break;
    case GF_E_BY2_SSE: s = "Specified -m BYTWO_x -r SIMD, but SSE2 is not supported."; break;
    case GF_E_LOGBADW: s = "With Log Tables, w must be <= 27."; break;
    case GF_E_LOG___J: s = "Cannot use Log tables with -r ALTMAP|SIMD|NOSIMD."; break;
    case GF_E_LOGPOLY: s = "Cannot use Log tables because the polynomial is not primitive."; break;
    case GF_E_ZERBADW: s = "With -m LOG_ZERO, w must be 8 or 16."; break;
    case GF_E_ZEXBADW: s = "With -m LOG_ZERO_EXT, w must be 8."; break;
    case GF_E_GR_ARGX: s = "With -m GROUP, arg1 and arg2 must be > 0."; break;
    case GF_E_GR_W_48: s = "With -m GROUP, w cannot be 4 or 8."; break;
    case GF_E_GR_W_16: s = "With -m GROUP, w == 16, arg1 and arg2 must be 4."; break;
    case GF_E_GR_128A: s = "With -m GROUP, w == 128, arg1 must be 4, and arg2 in { 4,8,16 }."; break;
    case GF_E_GR_A_27: s = "With -m GROUP, arg1 and arg2 must be <= 27."; break;
    case GF_E_GR_AR_W: s = "With -m GROUP, arg1 and arg2 must be <= w."; break;
    case GF_E_GR____J: s = "Cannot use GROUP with -r ALTMAP|SIMD|NOSIMD."; break;
    case GF_E_TABLE_W: s = "With -m TABLE, w must be < 15, or == 16."; break;
    case GF_E_TAB_SSE: s = "With -m TABLE, SIMD|NOSIMD only applies to w=4."; break;
    case GF_E_TABSSE3: s = "With -m TABLE, -r SIMD, you need SSSE3 supported."; break;
    case GF_E_TAB_ALT: s = "With -m TABLE, you cannot use ALTMAP."; break;
    case GF_E_SP128AR: s = "With -m SPLIT, w=128, bad arg1/arg2."; break;
    case GF_E_SP128AL: s = "With -m SPLIT, w=128, -r SIMD requires -r ALTMAP."; break;
    case GF_E_SP128AS: s = "With -m SPLIT, w=128, ALTMAP needs SSSE3 supported."; break;
    case GF_E_SP128_A: s = "With -m SPLIT, w=128, -r ALTMAP only with arg1/arg2 = 4/128."; break;
    case GF_E_SP128_S: s = "With -m SPLIT, w=128, -r SIMD|NOSIMD only with arg1/arg2 = 4/128."; break;
    case GF_E_SPLIT_W: s = "With -m SPLIT, w must be in {8, 16, 32, 64, 128}."; break;
    case GF_E_SP_16AR: s = "With -m SPLIT, w=16, Bad arg1/arg2."; break;
    case GF_E_SP_16_A: s = "With -m SPLIT, w=16, -r ALTMAP only with arg1/arg2 = 4/16."; break;
    case GF_E_SP_16_S: s = "With -m SPLIT, w=16, -r SIMD|NOSIMD only with arg1/arg2 = 4/16."; break;
    case GF_E_SP_32AR: s = "With -m SPLIT, w=32, Bad arg1/arg2."; break;
    case GF_E_SP_32AS: s = "With -m SPLIT, w=32, -r ALTMAP needs SSSE3 supported."; break;
    case GF_E_SP_32_A: s = "With -m SPLIT, w=32, -r ALTMAP only with arg1/arg2 = 4/32."; break;
    case GF_E_SP_32_S: s = "With -m SPLIT, w=32, -r SIMD|NOSIMD only with arg1/arg2 = 4/32."; break;
    case GF_E_SP_64AR: s = "With -m SPLIT, w=64, Bad arg1/arg2."; break;
    case GF_E_SP_64AS: s = "With -m SPLIT, w=64, -r ALTMAP needs SSSE3 supported."; break;
    case GF_E_SP_64_A: s = "With -m SPLIT, w=64, -r ALTMAP only with arg1/arg2 = 4/64."; break;
    case GF_E_SP_64_S: s = "With -m SPLIT, w=64, -r SIMD|NOSIMD only with arg1/arg2 = 4/64."; break;
    case GF_E_SP_8_AR: s = "With -m SPLIT, w=8, Bad arg1/arg2."; break;
    case GF_E_SP_8__A: s = "With -m SPLIT, w=8, Can't have -r ALTMAP."; break;
    case GF_E_SP_SSE3: s = "With -m SPLIT, Need SSSE3 support for SIMD."; break;
    case GF_E_COMP_A2: s = "With -m COMPOSITE, arg1 must equal 2."; break;
    case GF_E_COMP_SS: s = "With -m COMPOSITE, -r SIMD and -r NOSIMD do not apply."; break;
    case GF_E_COMP__W: s = "With -m COMPOSITE, w must be 8, 16, 32, 64 or 128."; break;
    case GF_E_UNKFLAG: s = "Unknown method flag - should be -m, -d, -r or -p."; break;
    case GF_E_UNKNOWN: s = "Unknown multiplication type."; break;
    case GF_E_UNK_REG: s = "Unknown region type."; break;
    case GF_E_UNK_DIV: s = "Unknown division type."; break;
    default: s = "Undefined error.";
  }

  fprintf(stderr, "%s\n", s);
}
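
/* Editor's note: a minimal usage sketch, not part of the original source and
   kept out of compilation.  It shows the intended error-reporting pattern:
   when an init call fails, _gf_errno has been set and gf_error() prints the
   matching message to stderr. */
#if 0
#include "gf_complete.h"

void example_report_init_failure(void)
{
  gf_t gf;

  /* Deliberately invalid: -r CAUCHY is not allowed with w > 32. */
  if (gf_init_hard(&gf, 64, GF_MULT_SHIFT, GF_REGION_CAUCHY,
                   GF_DIVIDE_DEFAULT, 0, 0, 0, NULL, NULL) == 0) {
    gf_error();   /* prints "Cannot specify -r CAUCHY with w > 32." */
  }
}
#endif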

uint64_t gf_composite_get_default_poly(gf_t *base)
{
  gf_internal_t *h;
  uint64_t rv;

  h = (gf_internal_t *) base->scratch;
  if (h->w == 4) {
    if (h->mult_type == GF_MULT_COMPOSITE) return 0;
    if (h->prim_poly == 0x13) return 2;
    return 0;
  }
  if (h->w == 8) {
    if (h->mult_type == GF_MULT_COMPOSITE) return 0;
    if (h->prim_poly == 0x11d) return 3;
    return 0;
  }
  if (h->w == 16) {
    if (h->mult_type == GF_MULT_COMPOSITE) {
      rv = gf_composite_get_default_poly(h->base_gf);
      if (rv != h->prim_poly) return 0;
      if (rv == 3) return 0x105;
      return 0;
    } else {
      if (h->prim_poly == 0x1100b) return 2;
      if (h->prim_poly == 0x1002d) return 7;
      return 0;
    }
  }
  if (h->w == 32) {
    if (h->mult_type == GF_MULT_COMPOSITE) {
      rv = gf_composite_get_default_poly(h->base_gf);
      if (rv != h->prim_poly) return 0;
      if (rv == 2) return 0x10005;
      if (rv == 7) return 0x10008;
      if (rv == 0x105) return 0x10002;
      return 0;
    } else {
      if (h->prim_poly == 0x400007) return 2;
      if (h->prim_poly == 0xc5) return 3;
      return 0;
    }
  }
  if (h->w == 64) {
    if (h->mult_type == GF_MULT_COMPOSITE) {
      rv = gf_composite_get_default_poly(h->base_gf);
      if (rv != h->prim_poly) return 0;
      if (rv == 3) return 0x100000009ULL;
      if (rv == 2) return 0x100000004ULL;
      if (rv == 0x10005) return 0x100000003ULL;
      if (rv == 0x10002) return 0x100000005ULL;
      if (rv == 0x10008) return 0x100000006ULL; /* JSP: (0x100000003 works too,
                                                   but I want to differentiate cases). */
      return 0;
    } else {
      if (h->prim_poly == 0x1bULL) return 2;
      return 0;
    }
  }
  return 0;
}
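
/* Editor's note: a hedged usage sketch, not part of the original source and
   kept out of compilation.  It shows the intent of the lookup above: given an
   already-initialized base field, return the library's default defining
   polynomial for the composite field built over it (0 means "no default").
   Assumes the internal declarations from gf_int.h and the stock w=8 default
   polynomial 0x11d, for which the lookup should return 3. */
#if 0
#include "gf_int.h"

void example_default_composite_poly(void)
{
  gf_t base;
  uint64_t poly;

  gf_init_easy(&base, 8);                      /* base field GF(2^8) */
  poly = gf_composite_get_default_poly(&base); /* 3 for prim_poly 0x11d */
  gf_free(&base, 0);
  (void) poly;
}
#endif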

int gf_error_check(int w, int mult_type, int region_type, int divide_type,
                   int arg1, int arg2, uint64_t poly, gf_t *base)
{
  int sse3 = 0;
  int sse2 = 0;
  int pclmul = 0;
  int rdouble, rquad, rlazy, rsimd, rnosimd, raltmap, rcauchy, tmp;
  gf_internal_t *sub;

  rdouble = (region_type & GF_REGION_DOUBLE_TABLE);
  rquad   = (region_type & GF_REGION_QUAD_TABLE);
  rlazy   = (region_type & GF_REGION_LAZY);
  rsimd   = (region_type & GF_REGION_SIMD);
  rnosimd = (region_type & GF_REGION_NOSIMD);
  raltmap = (region_type & GF_REGION_ALTMAP);
  rcauchy = (region_type & GF_REGION_CAUCHY);

  if (divide_type != GF_DIVIDE_DEFAULT &&
      divide_type != GF_DIVIDE_MATRIX &&
      divide_type != GF_DIVIDE_EUCLID) {
    _gf_errno = GF_E_UNK_DIV;
    return 0;
  }

  tmp = ( GF_REGION_DOUBLE_TABLE | GF_REGION_QUAD_TABLE | GF_REGION_LAZY |
          GF_REGION_SIMD | GF_REGION_NOSIMD | GF_REGION_ALTMAP |
          GF_REGION_CAUCHY );
  if (region_type & (~tmp)) { _gf_errno = GF_E_UNK_REG; return 0; }

#ifdef INTEL_SSE2
  if (gf_cpu_supports_intel_sse2) {
    sse2 = 1;
  }
#endif

#ifdef INTEL_SSSE3
  if (gf_cpu_supports_intel_ssse3) {
    sse3 = 1;
  }
#endif

#ifdef INTEL_SSE4_PCLMUL
  if (gf_cpu_supports_intel_pclmul) {
    pclmul = 1;
  }
#endif

#ifdef ARM_NEON
  if (gf_cpu_supports_arm_neon) {
    pclmul = (w == 4 || w == 8);
    sse3 = 1;
  }
#endif

  if (w < 1 || (w > 32 && w != 64 && w != 128)) { _gf_errno = GF_E_BAD___W; return 0; }

  if (mult_type != GF_MULT_COMPOSITE && w < 64) {
    if ((poly >> (w+1)) != 0) { _gf_errno = GF_E_BADPOLY; return 0; }
  }

  if (mult_type == GF_MULT_DEFAULT) {
    if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_MDEFDIV; return 0; }
    if (region_type != GF_REGION_DEFAULT) { _gf_errno = GF_E_MDEFREG; return 0; }
    if (arg1 != 0 || arg2 != 0) { _gf_errno = GF_E_MDEFARG; return 0; }
    return 1;
  }

  if (rsimd && rnosimd) { _gf_errno = GF_E_SIMD_NO; return 0; }
  if (rcauchy && w > 32) { _gf_errno = GF_E_CAUGT32; return 0; }
  if (rcauchy && region_type != GF_REGION_CAUCHY) { _gf_errno = GF_E_CAUCHYB; return 0; }
  if (rcauchy && mult_type == GF_MULT_COMPOSITE) { _gf_errno = GF_E_CAUCOMP; return 0; }

  if (arg1 != 0 && mult_type != GF_MULT_COMPOSITE &&
      mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) {
    _gf_errno = GF_E_ARG1SET;
    return 0;
  }

  if (arg2 != 0 && mult_type != GF_MULT_SPLIT_TABLE && mult_type != GF_MULT_GROUP) {
    _gf_errno = GF_E_ARG2SET;
    return 0;
  }

  if (divide_type == GF_DIVIDE_MATRIX && w > 32) { _gf_errno = GF_E_MATRIXW; return 0; }

  if (rdouble) {
    if (rquad) { _gf_errno = GF_E_DOUQUAD; return 0; }
    if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_DOUBLET; return 0; }
    if (w != 4 && w != 8) { _gf_errno = GF_E_DOUBLEW; return 0; }
    if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_DOUBLEJ; return 0; }
    if (rlazy && w == 4) { _gf_errno = GF_E_DOUBLEL; return 0; }
    return 1;
  }

  if (rquad) {
    if (mult_type != GF_MULT_TABLE) { _gf_errno = GF_E_QUAD__T; return 0; }
    if (w != 4) { _gf_errno = GF_E_QUAD__W; return 0; }
    if (rsimd || rnosimd || raltmap) { _gf_errno = GF_E_QUAD__J; return 0; }
    return 1;
  }

  if (rlazy) { _gf_errno = GF_E_LAZY__X; return 0; }

  if (mult_type == GF_MULT_SHIFT) {
    if (raltmap) { _gf_errno = GF_E_ALTSHIF; return 0; }
    if (rsimd || rnosimd) { _gf_errno = GF_E_SSESHIF; return 0; }
    return 1;
  }

  if (mult_type == GF_MULT_CARRY_FREE) {
    if (w != 4 && w != 8 && w != 16 &&
        w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; }
    if (w == 4 && (poly & 0xc)) { _gf_errno = GF_E_CFM4POL; return 0; }
    if (w == 8 && (poly & 0x80)) { _gf_errno = GF_E_CFM8POL; return 0; }
    if (w == 16 && (poly & 0xe000)) { _gf_errno = GF_E_CF16POL; return 0; }
    if (w == 32 && (poly & 0xfe000000)) { _gf_errno = GF_E_CF32POL; return 0; }
    if (w == 64 && (poly & 0xfffe000000000000ULL)) { _gf_errno = GF_E_CF64POL; return 0; }
    if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; }
    if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; }
    if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; }
    return 1;
  }

  if (mult_type == GF_MULT_CARRY_FREE_GK) {
    if (w != 4 && w != 8 && w != 16 &&
        w != 32 && w != 64 && w != 128) { _gf_errno = GF_E_CFM___W; return 0; }
    if (raltmap) { _gf_errno = GF_E_ALT_CFM; return 0; }
    if (rsimd || rnosimd) { _gf_errno = GF_E_SSE_CFM; return 0; }
    if (!pclmul) { _gf_errno = GF_E_PCLMULX; return 0; }
    return 1;
  }

  if (mult_type == GF_MULT_BYTWO_p || mult_type == GF_MULT_BYTWO_b) {
    if (raltmap) { _gf_errno = GF_E_ALT_BY2; return 0; }
    if (rsimd && !sse2) { _gf_errno = GF_E_BY2_SSE; return 0; }
    return 1;
  }

  if (mult_type == GF_MULT_LOG_TABLE || mult_type == GF_MULT_LOG_ZERO
                                     || mult_type == GF_MULT_LOG_ZERO_EXT) {
    if (w > 27) { _gf_errno = GF_E_LOGBADW; return 0; }
    if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_LOG___J; return 0; }

    if (mult_type == GF_MULT_LOG_TABLE) return 1;

    if (w != 8 && w != 16) { _gf_errno = GF_E_ZERBADW; return 0; }

    if (mult_type == GF_MULT_LOG_ZERO) return 1;

    if (w != 8) { _gf_errno = GF_E_ZEXBADW; return 0; }
    return 1;
  }

  if (mult_type == GF_MULT_GROUP) {
    if (arg1 <= 0 || arg2 <= 0) { _gf_errno = GF_E_GR_ARGX; return 0; }
    if (w == 4 || w == 8) { _gf_errno = GF_E_GR_W_48; return 0; }
    if (w == 16 && (arg1 != 4 || arg2 != 4)) { _gf_errno = GF_E_GR_W_16; return 0; }
    if (w == 128 && (arg1 != 4 ||
       (arg2 != 4 && arg2 != 8 && arg2 != 16))) { _gf_errno = GF_E_GR_128A; return 0; }
    if (arg1 > 27 || arg2 > 27) { _gf_errno = GF_E_GR_A_27; return 0; }
    if (arg1 > w || arg2 > w) { _gf_errno = GF_E_GR_AR_W; return 0; }
    if (raltmap || rsimd || rnosimd) { _gf_errno = GF_E_GR____J; return 0; }
    return 1;
  }

  if (mult_type == GF_MULT_TABLE) {
    if (w != 16 && w >= 15) { _gf_errno = GF_E_TABLE_W; return 0; }
    if (w != 4 && (rsimd || rnosimd)) { _gf_errno = GF_E_TAB_SSE; return 0; }
    if (rsimd && !sse3) { _gf_errno = GF_E_TABSSE3; return 0; }
    if (raltmap) { _gf_errno = GF_E_TAB_ALT; return 0; }
    return 1;
  }

  if (mult_type == GF_MULT_SPLIT_TABLE) {
    if (arg1 > arg2) {
      tmp = arg1;
      arg1 = arg2;
      arg2 = tmp;
    }
    if (w == 8) {
      if (arg1 != 4 || arg2 != 8) { _gf_errno = GF_E_SP_8_AR; return 0; }
      if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
      if (raltmap) { _gf_errno = GF_E_SP_8__A; return 0; }
    } else if (w == 16) {
      if ((arg1 == 8 && arg2 == 8) ||
          (arg1 == 8 && arg2 == 16)) {
        if (rsimd || rnosimd) { _gf_errno = GF_E_SP_16_S; return 0; }
        if (raltmap) { _gf_errno = GF_E_SP_16_A; return 0; }
      } else if (arg1 == 4 && arg2 == 16) {
        if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
      } else { _gf_errno = GF_E_SP_16AR; return 0; }
    } else if (w == 32) {
      if ((arg1 == 8 && arg2 == 8) ||
          (arg1 == 8 && arg2 == 32) ||
          (arg1 == 16 && arg2 == 32)) {
        if (rsimd || rnosimd) { _gf_errno = GF_E_SP_32_S; return 0; }
        if (raltmap) { _gf_errno = GF_E_SP_32_A; return 0; }
      } else if (arg1 == 4 && arg2 == 32) {
        if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
        if (raltmap && !sse3) { _gf_errno = GF_E_SP_32AS; return 0; }
        if (raltmap && rnosimd) { _gf_errno = GF_E_SP_32AS; return 0; }
      } else { _gf_errno = GF_E_SP_32AR; return 0; }
    } else if (w == 64) {
      if ((arg1 == 8 && arg2 == 8) ||
          (arg1 == 8 && arg2 == 64) ||
          (arg1 == 16 && arg2 == 64)) {
        if (rsimd || rnosimd) { _gf_errno = GF_E_SP_64_S; return 0; }
        if (raltmap) { _gf_errno = GF_E_SP_64_A; return 0; }
      } else if (arg1 == 4 && arg2 == 64) {
        if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
        if (raltmap && !sse3) { _gf_errno = GF_E_SP_64AS; return 0; }
        if (raltmap && rnosimd) { _gf_errno = GF_E_SP_64AS; return 0; }
      } else { _gf_errno = GF_E_SP_64AR; return 0; }
    } else if (w == 128) {
      if (arg1 == 8 && arg2 == 128) {
        if (rsimd || rnosimd) { _gf_errno = GF_E_SP128_S; return 0; }
        if (raltmap) { _gf_errno = GF_E_SP128_A; return 0; }
      } else if (arg1 == 4 && arg2 == 128) {
        if (rsimd && !sse3) { _gf_errno = GF_E_SP_SSE3; return 0; }
        if (raltmap && !sse3) { _gf_errno = GF_E_SP128AS; return 0; }
        if (raltmap && rnosimd) { _gf_errno = GF_E_SP128AS; return 0; }
      } else { _gf_errno = GF_E_SP128AR; return 0; }
    } else { _gf_errno = GF_E_SPLIT_W; return 0; }
    return 1;
  }

  if (mult_type == GF_MULT_COMPOSITE) {
    if (w != 8 && w != 16 && w != 32
        && w != 64 && w != 128) { _gf_errno = GF_E_COMP__W; return 0; }
    if (w < 128 && (poly >> (w/2)) != 0) { _gf_errno = GF_E_COMP_PP; return 0; }
    if (divide_type != GF_DIVIDE_DEFAULT) { _gf_errno = GF_E_DIVCOMP; return 0; }
    if (arg1 != 2) { _gf_errno = GF_E_COMP_A2; return 0; }
    if (rsimd || rnosimd) { _gf_errno = GF_E_COMP_SS; return 0; }
    if (base != NULL) {
      sub = (gf_internal_t *) base->scratch;
      if (sub->w != w/2) { _gf_errno = GF_E_BASE__W; return 0; }
      if (poly == 0) {
        if (gf_composite_get_default_poly(base) == 0) { _gf_errno = GF_E_COMPXPP; return 0; }
      }
    }
    return 1;
  }

  _gf_errno = GF_E_UNKNOWN;
  return 0;
}
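
/* Editor's note: a minimal sketch, not in the original source and kept out of
   compilation.  gf_error_check() can be used as a pre-flight validator before
   anything is allocated; on failure it sets _gf_errno, which gf_error()
   translates to text.  The argument combination below is deliberately
   invalid. */
#if 0
#include "gf_int.h"

void example_validate_args(void)
{
  /* -m GROUP requires positive arg1/arg2, so this sets GF_E_GR_ARGX. */
  if (!gf_error_check(32, GF_MULT_GROUP, GF_REGION_DEFAULT,
                      GF_DIVIDE_DEFAULT, 0, 0, 0, NULL)) {
    gf_error();   /* "With -m GROUP, arg1 and arg2 must be > 0." */
  }
}
#endif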

int gf_scratch_size(int w,
                    int mult_type,
                    int region_type,
                    int divide_type,
                    int arg1,
                    int arg2)
{
  if (gf_error_check(w, mult_type, region_type, divide_type, arg1, arg2, 0, NULL) == 0) return 0;

  switch(w) {
    case 4: return gf_w4_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 8: return gf_w8_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 16: return gf_w16_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 32: return gf_w32_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 64: return gf_w64_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    case 128: return gf_w128_scratch_size(mult_type, region_type, divide_type, arg1, arg2);
    default: return gf_wgen_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
  }
}
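
/* Editor's note: a hedged sketch, not in the original source and kept out of
   compilation.  It shows the intended pairing of gf_scratch_size() with
   gf_init_hard()'s scratch_memory argument: size the buffer first, then hand
   it to the initializer, so the library allocates nothing itself. */
#if 0
#include "gf_complete.h"
#include <stdlib.h>

int example_init_with_own_scratch(gf_t *gf, void **scratch_out)
{
  int sz;
  void *scratch;

  sz = gf_scratch_size(16, GF_MULT_LOG_TABLE, GF_REGION_DEFAULT,
                       GF_DIVIDE_DEFAULT, 0, 0);
  if (sz == 0) { gf_error(); return 0; }

  scratch = malloc(sz);
  if (scratch == NULL) return 0;

  if (!gf_init_hard(gf, 16, GF_MULT_LOG_TABLE, GF_REGION_DEFAULT,
                    GF_DIVIDE_DEFAULT, 0, 0, 0, NULL, scratch)) {
    gf_error();
    free(scratch);
    return 0;
  }
  *scratch_out = scratch;   /* caller frees after gf_free(gf, 0) */
  return 1;
}
#endif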

extern int gf_size(gf_t *gf)
{
  gf_internal_t *h;
  int s;

  s = sizeof(gf_t);
  h = (gf_internal_t *) gf->scratch;
  s += gf_scratch_size(h->w, h->mult_type, h->region_type, h->divide_type, h->arg1, h->arg2);
  if (h->mult_type == GF_MULT_COMPOSITE) s += gf_size(h->base_gf);
  return s;
}

int gf_init_easy(gf_t *gf, int w)
{
  return gf_init_hard(gf, w, GF_MULT_DEFAULT, GF_REGION_DEFAULT, GF_DIVIDE_DEFAULT,
                      0, 0, 0, NULL, NULL);
}
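
/* Editor's note: a minimal sketch, not in the original source and kept out of
   compilation.  This is the easy path: default method selection for the word
   size, then single-word arithmetic through the function pointers that the
   word-size init routine installed. */
#if 0
#include "gf_complete.h"
#include <stdio.h>

void example_easy_arithmetic(void)
{
  gf_t gf;
  uint32_t c;

  if (!gf_init_easy(&gf, 16)) { gf_error(); return; }
  c = gf.multiply.w32(&gf, 0x1234, 0x5678);                    /* product in GF(2^16) */
  printf("0x1234 * 0x5678 = 0x%x\n", c);
  printf("back again: 0x%x\n", gf.divide.w32(&gf, c, 0x5678)); /* 0x1234 */
  gf_free(&gf, 0);
}
#endif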

/* Allen: What's going on here is this function is putting info into the
   scratch mem of gf, and then calling the relevant REAL init
   func for the word size.  Probably done this way to consolidate
   those aspects of initialization that don't rely on word size,
   and then take care of word-size-specific stuff. */

int gf_init_hard(gf_t *gf, int w, int mult_type,
                 int region_type,
                 int divide_type,
                 uint64_t prim_poly,
                 int arg1, int arg2,
                 gf_t *base_gf,
                 void *scratch_memory)
{
  int sz;
  gf_internal_t *h;

  gf_cpu_identify();

  if (gf_error_check(w, mult_type, region_type, divide_type,
                     arg1, arg2, prim_poly, base_gf) == 0) return 0;

  sz = gf_scratch_size(w, mult_type, region_type, divide_type, arg1, arg2);
  if (sz <= 0) return 0;  /* This shouldn't happen, as all errors should get caught
                             in gf_error_check() */

  if (scratch_memory == NULL) {
    h = (gf_internal_t *) malloc(sz);
    if (h == NULL) return 0;   /* allocation failure */
    h->free_me = 1;
  } else {
    h = scratch_memory;
    h->free_me = 0;
  }
  gf->scratch = (void *) h;
  h->mult_type = mult_type;
  h->region_type = region_type;
  h->divide_type = divide_type;
  h->w = w;
  h->prim_poly = prim_poly;
  h->arg1 = arg1;
  h->arg2 = arg2;
  h->base_gf = base_gf;
  h->private = (void *) gf->scratch;
  h->private = (uint8_t *)h->private + (sizeof(gf_internal_t));
  gf->extract_word.w32 = NULL;

  switch(w) {
    case 4: return gf_w4_init(gf);
    case 8: return gf_w8_init(gf);
    case 16: return gf_w16_init(gf);
    case 32: return gf_w32_init(gf);
    case 64: return gf_w64_init(gf);
    case 128: return gf_w128_init(gf);
    default: return gf_wgen_init(gf);
  }
}
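
/* Editor's note: a hedged sketch, not in the original source and kept out of
   compilation.  The hard path with a base field: build GF((2^16)^2) as a
   composite GF(2^32).  prim_poly = 0 defers to
   gf_composite_get_default_poly(); gf_free(&gf, 1) then tears the base field
   down recursively, matching the free(h->base_gf) in gf_free() below. */
#if 0
#include "gf_complete.h"
#include <stdlib.h>

int example_composite_field(void)
{
  gf_t *base = (gf_t *) malloc(sizeof(gf_t));
  gf_t gf;

  if (base == NULL || !gf_init_easy(base, 16)) return 0;

  if (!gf_init_hard(&gf, 32, GF_MULT_COMPOSITE, GF_REGION_DEFAULT,
                    GF_DIVIDE_DEFAULT, 0, 2, 0, base, NULL)) {
    gf_error();
    return 0;
  }
  /* ... use gf.multiply.w32() etc. ... */
  gf_free(&gf, 1);   /* recursive: frees base's scratch, then base itself */
  return 1;
}
#endif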

int gf_free(gf_t *gf, int recursive)
{
  gf_internal_t *h;

  h = (gf_internal_t *) gf->scratch;
  if (recursive && h->base_gf != NULL) {
    gf_free(h->base_gf, 1);
    free(h->base_gf);
  }
  if (h->free_me) free(h);
  return 0; /* Making compiler happy */
}

void gf_alignment_error(char *s, int a)
{
  fprintf(stderr, "Alignment error in %s:\n", s);
  fprintf(stderr, "   The source and destination buffers must be aligned to each other,\n");
  fprintf(stderr, "   and they must be aligned to a %d-byte address.\n", a);
  assert(0);
}

static
void gf_invert_binary_matrix(uint32_t *mat, uint32_t *inv, int rows) {
  int cols, i, j;
  uint32_t tmp;

  cols = rows;

  for (i = 0; i < rows; i++) inv[i] = (1 << i);

  /* First -- convert into upper triangular */

  for (i = 0; i < cols; i++) {
    /* Swap rows if we have a zero i,i element.  If we can't swap, then the
       matrix was not invertible */

    if ((mat[i] & (1 << i)) == 0) {
      for (j = i+1; j < rows && (mat[j] & (1 << i)) == 0; j++) ;
      if (j == rows) {
        fprintf(stderr, "galois_invert_matrix: Matrix not invertible!!\n");
        assert(0);
      }
      tmp = mat[i]; mat[i] = mat[j]; mat[j] = tmp;
      tmp = inv[i]; inv[i] = inv[j]; inv[j] = tmp;
    }

    /* Now for each j>i, add A_ji*Ai to Aj */
    for (j = i+1; j != rows; j++) {
      if ((mat[j] & (1 << i)) != 0) {
        mat[j] ^= mat[i];
        inv[j] ^= inv[i];
      }
    }
  }
  /* Now the matrix is upper triangular.  Back-substitute from the bottom
     row up, clearing the entries above each diagonal element. */

  for (i = rows-1; i >= 0; i--) {
    for (j = 0; j < i; j++) {
      if (mat[j] & (1 << i)) {
        /* mat[j] ^= mat[i]; */
        inv[j] ^= inv[i];
      }
    }
  }
}

uint32_t gf_bitmatrix_inverse(uint32_t y, int w, uint32_t pp)
{
  uint32_t mat[32], inv[32], mask;
  int i;

  mask = (w == 32) ? 0xffffffff : ((uint32_t)1 << w) - 1;
  for (i = 0; i < w; i++) {
    mat[i] = y;

    if (y & (1 << (w-1))) {
      y = y << 1;
      y = ((y ^ pp) & mask);
    } else {
      y = y << 1;
    }
  }

  gf_invert_binary_matrix(mat, inv, w);
  return inv[0];
}
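
/* Editor's note: a hedged sketch, not in the original source and kept out of
   compilation.  gf_bitmatrix_inverse() inverts the w x w bit matrix that
   represents multiplication by y, and the row returned corresponds to y^-1.
   The check below verifies y * y^-1 == 1, assuming the stock w=8 polynomial
   0x11d. */
#if 0
#include "gf_int.h"
#include <assert.h>

void example_bitmatrix_inverse(void)
{
  gf_t gf;
  uint32_t y = 0x53, yi;

  gf_init_easy(&gf, 8);
  yi = gf_bitmatrix_inverse(y, 8, 0x11d);
  assert(gf.multiply.w32(&gf, y, yi) == 1);
  gf_free(&gf, 0);
}
#endif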

void gf_two_byte_region_table_multiply(gf_region_data *rd, uint16_t *base)
{
  uint64_t a, prod;
  int xor;
  uint64_t *s64, *d64, *top;

  s64 = rd->s_start;
  d64 = rd->d_start;
  top = rd->d_top;
  xor = rd->xor;

  if (xor) {
    while (d64 != top) {
      a = *s64;
      prod = base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      prod ^= *d64;
      *d64 = prod;
      s64++;
      d64++;
    }
  } else {
    while (d64 != top) {
      a = *s64;
      prod = base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      a <<= 16;
      prod <<= 16;
      prod ^= base[a >> 48];
      *d64 = prod;
      s64++;
      d64++;
    }
  }
}

static void gf_slow_multiply_region(gf_region_data *rd, void *src, void *dest, void *s_top)
{
  uint8_t *s8, *d8;
  uint16_t *s16, *d16;
  uint32_t *s32, *d32;
  uint64_t *s64, *d64;
  gf_internal_t *h;
  int wb;
  uint32_t p, a;

  h = rd->gf->scratch;
  wb = (h->w)/8;
  if (wb == 0) wb = 1;

  while (src < s_top) {
    switch (h->w) {
      case 8:
        s8 = (uint8_t *) src;
        d8 = (uint8_t *) dest;
        *d8 = (rd->xor) ? (*d8 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s8)) :
                          rd->gf->multiply.w32(rd->gf, rd->val, *s8);
        break;
      case 4:
        s8 = (uint8_t *) src;
        d8 = (uint8_t *) dest;
        a = *s8;
        p = rd->gf->multiply.w32(rd->gf, rd->val, a&0xf);
        p |= (rd->gf->multiply.w32(rd->gf, rd->val, a >> 4) << 4);
        if (rd->xor) p ^= *d8;
        *d8 = p;
        break;
      case 16:
        s16 = (uint16_t *) src;
        d16 = (uint16_t *) dest;
        *d16 = (rd->xor) ? (*d16 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s16)) :
                           rd->gf->multiply.w32(rd->gf, rd->val, *s16);
        break;
      case 32:
        s32 = (uint32_t *) src;
        d32 = (uint32_t *) dest;
        *d32 = (rd->xor) ? (*d32 ^ rd->gf->multiply.w32(rd->gf, rd->val, *s32)) :
                           rd->gf->multiply.w32(rd->gf, rd->val, *s32);
        break;
      case 64:
        s64 = (uint64_t *) src;
        d64 = (uint64_t *) dest;
        *d64 = (rd->xor) ? (*d64 ^ rd->gf->multiply.w64(rd->gf, rd->val, *s64)) :
                           rd->gf->multiply.w64(rd->gf, rd->val, *s64);
        break;
      default:
        fprintf(stderr, "Error: gf_slow_multiply_region: w=%d not implemented.\n", h->w);
        exit(1);
    }
    src = (uint8_t *)src + wb;
    dest = (uint8_t *)dest + wb;
  }
}

/* JSP - The purpose of this procedure is to error check alignment,
   and to set up the region operation so that it can best leverage
   large words.

   It stores its information in rd.

   Assuming you're not doing Cauchy coding, (see below for that),
   then w will be 4, 8, 16, 32 or 64.  It can't be 128 (probably
   should change that).

   src and dest must then be aligned on ceil(w/8)-byte boundaries.
   Moreover, bytes must be a multiple of ceil(w/8).  If the variable
   align is equal to ceil(w/8), then we will set s_start = src,
   d_start = dest, s_top to (src+bytes) and d_top to (dest+bytes).
   And we return -- the implementation will go ahead and do the
   multiplication on individual words (e.g. using discrete logs).

   If align is greater than ceil(w/8), then the implementation needs
   to work on groups of "align" bytes.  For example, suppose you are
   implementing BYTWO, without SSE.  Then you will be doing the region
   multiplication in units of 8 bytes, so align = 8.  Or, suppose you
   are doing a Quad table in GF(2^4).  You will be doing the region
   multiplication in units of 2 bytes, so align = 2.  Or, suppose you
   are doing split multiplication with SSE operations in GF(2^8).
   Then align = 16.  Worse yet, suppose you are doing split
   multiplication with SSE operations in GF(2^16), with or without
   ALTMAP.  Then, you will be doing the multiplication on 256 bits at
   a time.  So align = 32.

   When align does not equal ceil(w/8), we split the region
   multiplication into three parts.  We are going to make s_start be
   the first address greater than or equal to src that is a multiple
   of align.  s_top is going to be the largest address <= src+bytes
   such that (s_top - s_start) is a multiple of align.  We do the
   same with d_start and d_top.  When we say that "src and dest must
   be aligned with respect to each other," we mean that s_start-src
   must equal d_start-dest.

   Now, the region multiplication is done in three parts -- the part
   between src and s_start must be done using single words.
   Similarly, the part between s_top and src+bytes must also be done
   using single words.  The part between s_start and s_top will be
   done in chunks of "align" bytes.

   One final thing -- if align > 16, then s_start and d_start will be
   aligned on a 16 byte boundary.  Perhaps we should have two
   variables: align and chunksize.  Then we'd have s_start & d_start
   aligned to "align", and have s_top-s_start be a multiple of
   chunksize.  That may be less confusing, but it would be a big
   change.

   Finally, if align = -1, then we are doing Cauchy multiplication,
   using only XOR's.  In this case, we're not going to care about
   alignment because we are just doing XOR's.  Instead, the only
   thing we care about is that bytes must be a multiple of w.

   This is not to say that alignment doesn't matter in performance
   with XOR's.  See that discussion in gf_multby_one().

   After you call gf_set_region_data(), the procedure
   gf_do_initial_region_alignment() calls gf->multiply.w32() on
   everything between src and s_start.  The procedure
   gf_do_final_region_alignment() calls gf->multiply.w32() on
   everything between s_top and src+bytes.
 */

void gf_set_region_data(gf_region_data *rd,
                        gf_t *gf,
                        void *src,
                        void *dest,
                        int bytes,
                        uint64_t val,
                        int xor,
                        int align)
{
  gf_internal_t *h = NULL;
  int wb;
  uint32_t a;
  unsigned long uls, uld;

  if (gf == NULL) {  /* JSP - Can be NULL if you're just doing XOR's */
    wb = 1;
  } else {
    h = gf->scratch;
    wb = (h->w)/8;
    if (wb == 0) wb = 1;
  }

  rd->gf = gf;
  rd->src = src;
  rd->dest = dest;
  rd->bytes = bytes;
  rd->val = val;
  rd->xor = xor;
  rd->align = align;

  uls = (unsigned long) src;
  uld = (unsigned long) dest;

  a = (align <= 16) ? align : 16;

  if (align == -1) { /* JSP: This is cauchy.  Error check bytes, then set up the pointers
                        so that there are no alignment regions. */
    if (h != NULL && bytes % h->w != 0) {
      fprintf(stderr, "Error in region multiply operation.\n");
      fprintf(stderr, "The size must be a multiple of %d bytes.\n", h->w);
      assert(0);
    }

    rd->s_start = src;
    rd->d_start = dest;
    rd->s_top = (uint8_t *)src + bytes;
    rd->d_top = (uint8_t *)dest + bytes;
    return;
  }

  if (uls % a != uld % a) {
    fprintf(stderr, "Error in region multiply operation.\n");
    fprintf(stderr, "The source & destination pointers must be aligned with respect\n");
    fprintf(stderr, "to each other along a %d byte boundary.\n", a);
    fprintf(stderr, "Src = 0x%lx.  Dest = 0x%lx\n", (unsigned long) src,
            (unsigned long) dest);
    assert(0);
  }

  if (uls % wb != 0) {
    fprintf(stderr, "Error in region multiply operation.\n");
    fprintf(stderr, "The pointers must be aligned along a %d byte boundary.\n", wb);
    fprintf(stderr, "Src = 0x%lx.  Dest = 0x%lx\n", (unsigned long) src,
            (unsigned long) dest);
    assert(0);
  }

  if (bytes % wb != 0) {
    fprintf(stderr, "Error in region multiply operation.\n");
    fprintf(stderr, "The size must be a multiple of %d bytes.\n", wb);
    assert(0);
  }

  uls %= a;
  if (uls != 0) uls = (a-uls);
  rd->s_start = (uint8_t *)rd->src + uls;
  rd->d_start = (uint8_t *)rd->dest + uls;
  bytes -= uls;
  bytes -= (bytes % align);
  rd->s_top = (uint8_t *)rd->s_start + bytes;
  rd->d_top = (uint8_t *)rd->d_start + bytes;
}
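
/* Editor's note: a hedged sketch, not in the original source and kept out of
   compilation.  It shows the three-part pattern described in the comment
   above, as a region implementation would use it: the unaligned head and tail
   go through gf_do_initial/final_region_alignment(), and the aligned middle
   [s_start, s_top) is processed in "align"-byte chunks.  For concreteness the
   chunk kernel here is multiply-by-one with xor = 1, i.e. a plain XOR. */
#if 0
#include "gf_int.h"

static void example_region_xor_accumulate(gf_t *gf, void *src, void *dest, int bytes)
{
  gf_region_data rd;
  uint64_t *s64, *d64;

  /* val = 1, xor = 1, align = 8: dest ^= src over 8-byte chunks. */
  gf_set_region_data(&rd, gf, src, dest, bytes, 1, 1, 8);
  gf_do_initial_region_alignment(&rd);    /* head, word by word */

  s64 = (uint64_t *) rd.s_start;
  d64 = (uint64_t *) rd.d_start;
  while (d64 < (uint64_t *) rd.d_top) {   /* aligned middle */
    *d64 ^= *s64;                         /* stand-in fast kernel */
    d64++;
    s64++;
  }

  gf_do_final_region_alignment(&rd);      /* tail, word by word */
}
#endif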

void gf_do_initial_region_alignment(gf_region_data *rd)
{
  gf_slow_multiply_region(rd, rd->src, rd->dest, rd->s_start);
}

void gf_do_final_region_alignment(gf_region_data *rd)
{
  gf_slow_multiply_region(rd, rd->s_top, rd->d_top, (uint8_t *)rd->src+rd->bytes);
}

void gf_multby_zero(void *dest, int bytes, int xor)
{
  if (xor) return;
  bzero(dest, bytes);
  return;
}

/* JSP - gf_multby_one tries to do this in the most efficient way
   possible.  If xor = 0, then simply call memcpy() since that
   should be optimized by the system.  Otherwise, try to do the xor
   in the following order:

   If src and dest are aligned with respect to each other on 16-byte
   boundaries and you have SSE instructions, then use aligned SSE
   instructions.

   If they aren't but you still have SSE instructions, use unaligned
   SSE instructions.

   If there are no SSE instructions, but they are aligned with
   respect to each other on 8-byte boundaries, then do them with
   uint64_t's.

   Otherwise, call gf_unaligned_xor(), which does the following:
   align a destination pointer along an 8-byte boundary, and then
   memcpy 32 bytes at a time from the src pointer to an array of
   doubles.  I'm not sure if that's the best -- probably needs
   testing, but this seems like it could be a black hole.
 */

static void gf_unaligned_xor(void *src, void *dest, int bytes);

void gf_multby_one(void *src, void *dest, int bytes, int xor)
{
  unsigned long uls, uld;
  uint8_t *s8, *d8;
  uint64_t *s64, *d64, *dtop64;
  gf_region_data rd;

  if (!xor) {
    if (dest != src)
      memcpy(dest, src, bytes);
    return;
  }
  uls = (unsigned long) src;
  uld = (unsigned long) dest;

#ifdef INTEL_SSE2
  if (gf_cpu_supports_intel_sse2) {
    __m128i ms, md;
    int abytes;
    s8 = (uint8_t *) src;
    d8 = (uint8_t *) dest;
    if (uls % 16 == uld % 16) {
      gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
      while (s8 != rd.s_start) {
        *d8 ^= *s8;
        d8++;
        s8++;
      }
      while (s8 < (uint8_t *) rd.s_top) {
        ms = _mm_load_si128 ((__m128i *)(s8));
        md = _mm_load_si128 ((__m128i *)(d8));
        md = _mm_xor_si128(md, ms);
        _mm_store_si128((__m128i *)(d8), md);
        s8 += 16;
        d8 += 16;
      }
      while (s8 != (uint8_t *) src + bytes) {
        *d8 ^= *s8;
        d8++;
        s8++;
      }
      return;
    }

    abytes = (bytes & 0xfffffff0);

    while (d8 < (uint8_t *) dest + abytes) {
      ms = _mm_loadu_si128 ((__m128i *)(s8));
      md = _mm_loadu_si128 ((__m128i *)(d8));
      md = _mm_xor_si128(md, ms);
      _mm_storeu_si128((__m128i *)(d8), md);
      s8 += 16;
      d8 += 16;
    }
    while (d8 != (uint8_t *) dest+bytes) {
      *d8 ^= *s8;
      d8++;
      s8++;
    }
    return;
  }
#endif
#if defined(ARM_NEON)
  if (gf_cpu_supports_arm_neon) {
    s8 = (uint8_t *) src;
    d8 = (uint8_t *) dest;

    if (uls % 16 == uld % 16) {
      gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 16);
      while (s8 != rd.s_start) {
        *d8 ^= *s8;
        s8++;
        d8++;
      }
      while (s8 < (uint8_t *) rd.s_top) {
        uint8x16_t vs = vld1q_u8 (s8);
        uint8x16_t vd = vld1q_u8 (d8);
        uint8x16_t vr = veorq_u8 (vs, vd);
        vst1q_u8 (d8, vr);
        s8 += 16;
        d8 += 16;
      }
    } else {
      while (s8 + 15 < (uint8_t *) src + bytes) {
        uint8x16_t vs = vld1q_u8 (s8);
        uint8x16_t vd = vld1q_u8 (d8);
        uint8x16_t vr = veorq_u8 (vs, vd);
        vst1q_u8 (d8, vr);
        s8 += 16;
        d8 += 16;
      }
    }
    while (s8 < (uint8_t *) src + bytes) {
      *d8 ^= *s8;
      s8++;
      d8++;
    }
    return;
  }
#endif
  if (uls % 8 != uld % 8) {
    gf_unaligned_xor(src, dest, bytes);
    return;
  }

  gf_set_region_data(&rd, NULL, src, dest, bytes, 1, xor, 8);
  s8 = (uint8_t *) src;
  d8 = (uint8_t *) dest;
  while (d8 != rd.d_start) {
    *d8 ^= *s8;
    d8++;
    s8++;
  }
  dtop64 = (uint64_t *) rd.d_top;

  d64 = (uint64_t *) rd.d_start;
  s64 = (uint64_t *) rd.s_start;

  while (d64 < dtop64) {
    *d64 ^= *s64;
    d64++;
    s64++;
  }

  s8 = (uint8_t *) rd.s_top;
  d8 = (uint8_t *) rd.d_top;

  while (d8 != (uint8_t *) dest+bytes) {
    *d8 ^= *s8;
    d8++;
    s8++;
  }
  return;
}
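
/* Editor's note: a minimal sketch, not in the original source and kept out of
   compilation.  With xor = 1, gf_multby_one() is the library's generic
   "dest ^= src" path, e.g. for accumulating parity in an erasure code. */
#if 0
#include "gf_complete.h"

void example_accumulate_parity(void *parity, void **data, int k, int bytes)
{
  int i;

  gf_multby_zero(parity, bytes, 0);            /* clear the accumulator */
  for (i = 0; i < k; i++)
    gf_multby_one(data[i], parity, bytes, 1);  /* parity ^= data[i] */
}
#endif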

#define UNALIGNED_BUFSIZE (8)

static void gf_unaligned_xor(void *src, void *dest, int bytes)
{
  uint64_t scopy[UNALIGNED_BUFSIZE], *d64;
  int i;
  gf_region_data rd;
  uint8_t *s8, *d8;

  /* JSP - call gf_set_region_data(), but use dest in both places.  This is
     because I only want to set up dest.  If I used src, gf_set_region_data()
     would fail because src and dest are not aligned to each other wrt
     8-byte pointers.  I know this will actually align d_start to 16 bytes.
     If I change gf_set_region_data() to split alignment & chunksize, then
     I could do this correctly. */

  gf_set_region_data(&rd, NULL, dest, dest, bytes, 1, 1, 8*UNALIGNED_BUFSIZE);
  s8 = (uint8_t *) src;
  d8 = (uint8_t *) dest;

  while (d8 < (uint8_t *) rd.d_start) {
    *d8 ^= *s8;
    d8++;
    s8++;
  }

  d64 = (uint64_t *) d8;
  while (d64 < (uint64_t *) rd.d_top) {
    memcpy(scopy, s8, 8*UNALIGNED_BUFSIZE);
    s8 += 8*UNALIGNED_BUFSIZE;
    for (i = 0; i < UNALIGNED_BUFSIZE; i++) {
      *d64 ^= scopy[i];
      d64++;
    }
  }

  d8 = (uint8_t *) d64;
  while (d8 < (uint8_t *) ((uint8_t *)dest+bytes)) {
    *d8 ^= *s8;
    d8++;
    s8++;
  }
}