module/icp/algs/skein/skein_block.c

   1 /*
   2  * Implementation of the Skein block functions.
   3  * Source code author: Doug Whiting, 2008.
   4  * This algorithm and source code is released to the public domain.
   5  * Compile-time switches:
   6  *  SKEIN_USE_ASM  -- set bits (256/512/1024) to select which
   7  *                    versions use ASM code for block processing
   8  *                    [default: use C for all block sizes]
   9  */
  10 /* Copyright 2013 Doug Whiting. This code is released to the public domain. */
  11
  12 #include <sys/skein.h>
  13 #include "skein_impl.h"
  14 #include <sys/isa_defs.h>       /* for _ILP32 */
  15
  16 #ifndef SKEIN_USE_ASM
  17 #define SKEIN_USE_ASM   (0)     /* default is all C code (no ASM) */
  18 #endif
  19
  20 #ifndef SKEIN_LOOP
  21 /*
  22  * The low-level checksum routines use a lot of stack space. On systems where
  23  * small stack frames are enforced (like 32-bit kernel builds), do not unroll
  24  * checksum calculations to save stack space.
  25  *
  26  * Even with no loops unrolled, we still can exceed the 1k stack frame limit
  27  * in Skein1024_Process_Block() (it hits 1272 bytes on ARM32).  We can
  28  * safely ignore it though, since the checksum functions will be called
  29  * from a worker thread that won't be using much stack.  That's why we have
  30  * the #pragma here to ignore the warning.
      *
      * SKEIN_LOOP is decoded as three decimal digits: the hundreds digit
      * becomes SKEIN_UNROLL_256, the tens digit SKEIN_UNROLL_512 and the
      * ones digit SKEIN_UNROLL_1024.  A digit of 0 selects the fully
      * unrolled code for that block size; a non-zero digit N selects the
      * looping code with N groups of 8 rounds per loop iteration.
  31  */
  32 #if defined(_ILP32) || defined(__powerpc)       /* Assume small stack */
  33 #pragma GCC diagnostic ignored "-Wframe-larger-than="
  34 /*
  35  * We're running on 32-bit, don't unroll loops to save stack frame space
  36  *
  37  * Due to the ways the calculations on SKEIN_LOOP are done in
  38  * Skein_*_Process_Block(), a value of 111 disables unrolling loops
  39  * in any of those functions.
  40  */
  41 #define SKEIN_LOOP 111
  42 #else
  43 /* We're compiling with large stacks */
     /* (001 is an octal literal, but its value is still 1) */
  44 #define SKEIN_LOOP 001          /* default: unroll 256 and 512, but not 1024 */
  45 #endif
  46 #endif
  47
  48 /* some useful definitions for code here */
     /*
      * These macros are only meaningful inside the block functions below:
      * BLK_BITS relies on the WCNT enum constant and ks/ts rely on the kw[]
      * array, both of which each function declares locally.  ts starts at
      * kw[0] (the functions use ts[0..2] for the tweak) and ks starts at
      * kw[3] (the chaining/key words).
      */
  49 #define BLK_BITS        (WCNT*64)
  50 #define KW_TWK_BASE     (0)
  51 #define KW_KEY_BASE     (3)
  52 #define ks              (kw + KW_KEY_BASE)
  53 #define ts              (kw + KW_TWK_BASE)
  54
  55 /* no debugging in Illumos version */
     /* (expands to nothing, so DebugSaveTweak(ctx); is an empty statement) */
  56 #define DebugSaveTweak(ctx)
  57
  58 /*
      * Skein_256 block function.
      *
      * Runs the block transform over blkCnt consecutive input blocks of
      * SKEIN_256_BLOCK_BYTES bytes each, starting at blkPtr, and folds them
      * into the chaining state held in ctx.
      *
      *   ctx        - supplies the chaining words ctx->X[0..3] and the tweak
      *                words ctx->h.T[0..1]; both are overwritten with the
      *                updated values.
      *   blkPtr     - input data; advanced by one block per iteration.
      *   blkCnt     - number of blocks to process.  Must be non-zero: the
      *                do/while loop always runs once and only tests
      *                --blkCnt afterwards, so a zero count is not handled.
      *   byteCntAdd - amount added to tweak word T[0] (the processed-length
      *                counter) once per block.
      *
      * SKEIN_T1_FLAG_FIRST is cleared from T[1] after every block.
      *
      * NOTE(review): the Skein_Show_*() hooks reference Xptr, which is only
      * declared under SKEIN_DEBUG, so they presumably expand to nothing in
      * non-debug builds -- confirm in skein_impl.h.
      */
  59 #if     !(SKEIN_USE_ASM & 256)
  60 void
  61 Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr,
  62     size_t blkCnt, size_t byteCntAdd)
  63 {
  64         enum {
  65                 WCNT = SKEIN_256_STATE_WORDS
  66         };
  67 #undef  RCNT
  68 #define RCNT  (SKEIN_256_ROUNDS_TOTAL / 8)
  69
  70 #ifdef  SKEIN_LOOP              /* configure how much to unroll the loop */
  71 #define SKEIN_UNROLL_256 (((SKEIN_LOOP) / 100) % 10)
  72 #else
  73 #define SKEIN_UNROLL_256 (0)
  74 #endif
  75
  76 #if     SKEIN_UNROLL_256
  77 #if     (RCNT % SKEIN_UNROLL_256)
  78 #error "Invalid SKEIN_UNROLL_256"       /* sanity check on unroll count */
  79 #endif
  80         size_t r;
  81         /* key schedule words : chaining vars + tweak + "rotation" */
  82         uint64_t kw[WCNT + 4 + RCNT * 2];
  83 #else
  84         uint64_t kw[WCNT + 4];  /* key schedule words : chaining vars + tweak */
  85 #endif
  86         /* local copy of context vars, for speed */
  87         uint64_t X0, X1, X2, X3;
  88         uint64_t w[WCNT];               /* local copy of input block */
  89 #ifdef  SKEIN_DEBUG
  90         /* use for debugging (help compiler put Xn in registers) */
  91         const uint64_t *Xptr[4];
  92         Xptr[0] = &X0;
  93         Xptr[1] = &X1;
  94         Xptr[2] = &X2;
  95         Xptr[3] = &X3;
  96 #endif
  97         Skein_assert(blkCnt != 0);      /* never call with blkCnt == 0! */
             /* ts[] and ks[] are both windows into kw[] (see the ts/ks macros) */
  98         ts[0] = ctx->h.T[0];
  99         ts[1] = ctx->h.T[1];
 100         do {
 101                 /*
 102                  * this implementation only supports 2**64 input bytes
 103                  * (no carry out here)
 104                  */
 105                 ts[0] += byteCntAdd;    /* update processed length */
 106
 107                 /* precompute the key schedule for this block */
 108                 ks[0] = ctx->X[0];
 109                 ks[1] = ctx->X[1];
 110                 ks[2] = ctx->X[2];
 111                 ks[3] = ctx->X[3];
 112                 ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY;
 113
 114                 ts[2] = ts[0] ^ ts[1];
 115
 116                 /* get input block in little-endian format */
 117                 Skein_Get64_LSB_First(w, blkPtr, WCNT);
 118                 DebugSaveTweak(ctx);
 119                 Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
 120
 121                 X0 = w[0] + ks[0];      /* do the first full key injection */
 122                 X1 = w[1] + ks[1] + ts[0];
 123                 X2 = w[2] + ks[2] + ts[1];
 124                 X3 = w[3] + ks[3];
 125
 126                 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
 127                     Xptr);      /* show starting state values */
 128
 129                 blkPtr += SKEIN_256_BLOCK_BYTES;
 130
 131                 /* run the rounds */
 132
     /*
      * One round: an add / rotate-left / xor step on each of the word pairs
      * (p0,p1) and (p2,p3).  rNum is accepted but not used by Round256 itself.
      */
 133 #define Round256(p0, p1, p2, p3, ROT, rNum)                          \
 134         X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \
 135         X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2; \
 136
 137 #if     SKEIN_UNROLL_256 == 0
 138 #define R256(p0, p1, p2, p3, ROT, rNum)         /* fully unrolled */    \
 139         Round256(p0, p1, p2, p3, ROT, rNum)             \
 140         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
 141
     /* unrolled key injection: R is a constant, so ks/ts are indexed modulo 5/3 */
 142 #define I256(R)                                                         \
 143         X0 += ks[((R) + 1) % 5]; /* inject the key schedule value */ \
 144         X1 += ks[((R) + 2) % 5] + ts[((R) + 1) % 3];                    \
 145         X2 += ks[((R) + 3) % 5] + ts[((R) + 2) % 3];                    \
 146         X3 += ks[((R) + 4) % 5] + (R) + 1;                      \
 147         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
 148 #else                           /* looping version */
 149 #define R256(p0, p1, p2, p3, ROT, rNum)                             \
 150         Round256(p0, p1, p2, p3, ROT, rNum)                             \
 151         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
 152
     /*
      * looping key injection: ks/ts are indexed by the loop counter r, and
      * each injection appends a copy of the oldest word one slot past the
      * current window instead of using the modulo indexing of the unrolled
      * variant.  This is why kw[] is RCNT * 2 words larger in this build.
      */
 153 #define I256(R)                                                         \
 154         X0 += ks[r + (R) + 0];  /* inject the key schedule value */     \
 155         X1 += ks[r + (R) + 1] + ts[r + (R) + 0];                        \
 156         X2 += ks[r + (R) + 2] + ts[r + (R) + 1];                        \
 157         X3 += ks[r + (R) + 3] + r + (R);                                \
 158         ks[r + (R) + 4] = ks[r + (R) - 1];   /* rotate key schedule */  \
 159         ts[r + (R) + 2] = ts[r + (R) - 1];                      \
 160         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
 161
 162                 /* loop thru it */
 163                 for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_256)
 164 #endif
 165                 {
 166 #define R256_8_rounds(R)                         \
 167         R256(0, 1, 2, 3, R_256_0, 8 * (R) + 1);  \
 168         R256(0, 3, 2, 1, R_256_1, 8 * (R) + 2);  \
 169         R256(0, 1, 2, 3, R_256_2, 8 * (R) + 3);  \
 170         R256(0, 3, 2, 1, R_256_3, 8 * (R) + 4);  \
 171         I256(2 * (R));                           \
 172         R256(0, 1, 2, 3, R_256_4, 8 * (R) + 5);  \
 173         R256(0, 3, 2, 1, R_256_5, 8 * (R) + 6);  \
 174         R256(0, 1, 2, 3, R_256_6, 8 * (R) + 7);  \
 175         R256(0, 3, 2, 1, R_256_7, 8 * (R) + 8);  \
 176         I256(2 * (R) + 1);
 177
 178                         R256_8_rounds(0);
 179
     /*
      * Non-zero when 8-round group NN has to be emitted here: either the
      * code is fully unrolled and NN is below the total number of groups,
      * or the loop body is configured to hold more than NN groups.
      */
 180 #define R256_Unroll_R(NN) \
 181         ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL / 8 > (NN)) || \
 182         (SKEIN_UNROLL_256 > (NN)))
 183
 184 #if     R256_Unroll_R(1)
 185                         R256_8_rounds(1);
 186 #endif
 187 #if     R256_Unroll_R(2)
 188                         R256_8_rounds(2);
 189 #endif
 190 #if     R256_Unroll_R(3)
 191                         R256_8_rounds(3);
 192 #endif
 193 #if     R256_Unroll_R(4)
 194                         R256_8_rounds(4);
 195 #endif
 196 #if     R256_Unroll_R(5)
 197                         R256_8_rounds(5);
 198 #endif
 199 #if     R256_Unroll_R(6)
 200                         R256_8_rounds(6);
 201 #endif
 202 #if     R256_Unroll_R(7)
 203                         R256_8_rounds(7);
 204 #endif
 205 #if     R256_Unroll_R(8)
 206                         R256_8_rounds(8);
 207 #endif
 208 #if     R256_Unroll_R(9)
 209                         R256_8_rounds(9);
 210 #endif
 211 #if     R256_Unroll_R(10)
 212                         R256_8_rounds(10);
 213 #endif
 214 #if     R256_Unroll_R(11)
 215                         R256_8_rounds(11);
 216 #endif
 217 #if     R256_Unroll_R(12)
 218                         R256_8_rounds(12);
 219 #endif
 220 #if     R256_Unroll_R(13)
 221                         R256_8_rounds(13);
 222 #endif
 223 #if     R256_Unroll_R(14)
 224                         R256_8_rounds(14);
 225 #endif
 226 #if     (SKEIN_UNROLL_256 > 14)
 227 #error  "need more unrolling in Skein_256_Process_Block"
 228 #endif
 229                 }
 230                 /*
 231                  * do the final "feedforward" xor, update context chaining vars
 232                  */
 233                 ctx->X[0] = X0 ^ w[0];
 234                 ctx->X[1] = X1 ^ w[1];
 235                 ctx->X[2] = X2 ^ w[2];
 236                 ctx->X[3] = X3 ^ w[3];
 237
 238                 Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
 239
                     /* blocks after this one must see the FIRST flag unset */
 240                 ts[1] &= ~SKEIN_T1_FLAG_FIRST;
 241         } while (--blkCnt);
             /* hand the updated tweak back to the caller's context */
 242         ctx->h.T[0] = ts[0];
 243         ctx->h.T[1] = ts[1];
 244 }
 245
 246 #if     defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
     /*
      * Measurement helper, built only when SKEIN_CODE_SIZE or SKEIN_PERF is
      * defined.  Returns the distance in bytes from the address of
      * Skein_256_Process_Block to the address of this function.
      *
      * NOTE(review): this equals the block function's code size only if the
      * toolchain places this function immediately after it, which C does
      * not guarantee -- treat the result as an estimate.
      */
 247 size_t
 248 Skein_256_Process_Block_CodeSize(void)
 249 {
 250         return ((uint8_t *)Skein_256_Process_Block_CodeSize) -
 251             ((uint8_t *)Skein_256_Process_Block);
 252 }
 253
     /*
      * Returns SKEIN_UNROLL_256, the unroll setting this file was compiled
      * with (0 means the 256-bit block function is fully unrolled).
      */
 254 uint_t
 255 Skein_256_Unroll_Cnt(void)
 256 {
 257         return (SKEIN_UNROLL_256);
 258 }
 259 #endif
 260 #endif
 261
 262 /*
      * Skein_512 block function.
      *
      * Runs the block transform over blkCnt consecutive input blocks of
      * SKEIN_512_BLOCK_BYTES bytes each, starting at blkPtr, and folds them
      * into the chaining state held in ctx.
      *
      *   ctx        - supplies the chaining words ctx->X[0..7] and the tweak
      *                words ctx->h.T[0..1]; both are overwritten with the
      *                updated values.
      *   blkPtr     - input data; advanced by one block per iteration.
      *   blkCnt     - number of blocks to process.  Must be non-zero: the
      *                do/while loop always runs once and only tests
      *                --blkCnt afterwards, so a zero count is not handled.
      *   byteCntAdd - amount added to tweak word T[0] (the processed-length
      *                counter) once per block.
      *
      * SKEIN_T1_FLAG_FIRST is cleared from T[1] after every block.
      *
      * NOTE(review): the Skein_Show_*() hooks reference Xptr, which is only
      * declared under SKEIN_DEBUG, so they presumably expand to nothing in
      * non-debug builds -- confirm in skein_impl.h.
      */
 263 #if     !(SKEIN_USE_ASM & 512)
 264 void
 265 Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr,
 266     size_t blkCnt, size_t byteCntAdd)
 267 {
 268         enum {
 269                 WCNT = SKEIN_512_STATE_WORDS
 270         };
 271 #undef  RCNT
 272 #define RCNT  (SKEIN_512_ROUNDS_TOTAL / 8)
 273
 274 #ifdef  SKEIN_LOOP              /* configure how much to unroll the loop */
 275 #define SKEIN_UNROLL_512 (((SKEIN_LOOP) / 10) % 10)
 276 #else
 277 #define SKEIN_UNROLL_512 (0)
 278 #endif
 279
 280 #if     SKEIN_UNROLL_512
 281 #if     (RCNT % SKEIN_UNROLL_512)
 282 #error "Invalid SKEIN_UNROLL_512"       /* sanity check on unroll count */
 283 #endif
 284         size_t r;
 285         /* key schedule words : chaining vars + tweak + "rotation" */
 286         uint64_t kw[WCNT + 4 + RCNT * 2];
 287 #else
 288         uint64_t kw[WCNT + 4];  /* key schedule words : chaining vars + tweak */
 289 #endif
 290         /* local copy of vars, for speed */
 291         uint64_t X0, X1, X2, X3, X4, X5, X6, X7;
 292         uint64_t w[WCNT];               /* local copy of input block */
 293 #ifdef  SKEIN_DEBUG
 294         /* use for debugging (help compiler put Xn in registers) */
 295         const uint64_t *Xptr[8];
 296         Xptr[0] = &X0;
 297         Xptr[1] = &X1;
 298         Xptr[2] = &X2;
 299         Xptr[3] = &X3;
 300         Xptr[4] = &X4;
 301         Xptr[5] = &X5;
 302         Xptr[6] = &X6;
 303         Xptr[7] = &X7;
 304 #endif
 305
 306         Skein_assert(blkCnt != 0);      /* never call with blkCnt == 0! */
             /* ts[] and ks[] are both windows into kw[] (see the ts/ks macros) */
 307         ts[0] = ctx->h.T[0];
 308         ts[1] = ctx->h.T[1];
 309         do {
 310                 /*
 311                  * this implementation only supports 2**64 input bytes
 312                  * (no carry out here)
 313                  */
 314                 ts[0] += byteCntAdd;    /* update processed length */
 315
 316                 /* precompute the key schedule for this block */
 317                 ks[0] = ctx->X[0];
 318                 ks[1] = ctx->X[1];
 319                 ks[2] = ctx->X[2];
 320                 ks[3] = ctx->X[3];
 321                 ks[4] = ctx->X[4];
 322                 ks[5] = ctx->X[5];
 323                 ks[6] = ctx->X[6];
 324                 ks[7] = ctx->X[7];
 325                 ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
 326                     ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
 327
 328                 ts[2] = ts[0] ^ ts[1];
 329
 330                 /* get input block in little-endian format */
 331                 Skein_Get64_LSB_First(w, blkPtr, WCNT);
 332                 DebugSaveTweak(ctx);
 333                 Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
 334
 335                 X0 = w[0] + ks[0];      /* do the first full key injection */
 336                 X1 = w[1] + ks[1];
 337                 X2 = w[2] + ks[2];
 338                 X3 = w[3] + ks[3];
 339                 X4 = w[4] + ks[4];
 340                 X5 = w[5] + ks[5] + ts[0];
 341                 X6 = w[6] + ks[6] + ts[1];
 342                 X7 = w[7] + ks[7];
 343
 344                 blkPtr += SKEIN_512_BLOCK_BYTES;
 345
 346                 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
 347                     Xptr);
 348                 /* run the rounds */
     /*
      * One round: an add / rotate-left / xor step on each of the four word
      * pairs (p0,p1) .. (p6,p7).  rNum is accepted but not used by Round512
      * itself.
      */
 349 #define Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)             \
 350         X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\
 351         X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\
 352         X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\
 353         X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;
 354
 355 #if     SKEIN_UNROLL_512 == 0
 356 #define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) /* unrolled */  \
 357         Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)             \
 358         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
 359
     /* unrolled key injection: R is a constant, so ks/ts are indexed modulo 9/3 */
 360 #define I512(R)                                                         \
 361         X0 += ks[((R) + 1) % 9];        /* inject the key schedule value */\
 362         X1 += ks[((R) + 2) % 9];                                        \
 363         X2 += ks[((R) + 3) % 9];                                        \
 364         X3 += ks[((R) + 4) % 9];                                        \
 365         X4 += ks[((R) + 5) % 9];                                        \
 366         X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3];                    \
 367         X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3];                    \
 368         X7 += ks[((R) + 8) % 9] + (R) + 1;                              \
 369         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
 370 #else                           /* looping version */
 371 #define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)                 \
 372         Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)             \
 373         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
 374
     /*
      * looping key injection: ks/ts are indexed by the loop counter r, and
      * each injection appends a copy of the oldest word one slot past the
      * current window instead of using the modulo indexing of the unrolled
      * variant.  This is why kw[] is RCNT * 2 words larger in this build.
      */
 375 #define I512(R)                                                         \
 376         X0 += ks[r + (R) + 0];  /* inject the key schedule value */     \
 377         X1 += ks[r + (R) + 1];                                          \
 378         X2 += ks[r + (R) + 2];                                          \
 379         X3 += ks[r + (R) + 3];                                          \
 380         X4 += ks[r + (R) + 4];                                          \
 381         X5 += ks[r + (R) + 5] + ts[r + (R) + 0];                        \
 382         X6 += ks[r + (R) + 6] + ts[r + (R) + 1];                        \
 383         X7 += ks[r + (R) + 7] + r + (R);                                \
 384         ks[r + (R)+8] = ks[r + (R) - 1];        /* rotate key schedule */\
 385         ts[r + (R)+2] = ts[r + (R) - 1];                                \
 386         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
 387
 388                 /* loop thru it */
 389                 for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_512)
 390 #endif                          /* end of looped code definitions */
 391                 {
 392 #define R512_8_rounds(R)        /* do 8 full rounds */                  \
 393         R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1);             \
 394         R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2);             \
 395         R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3);             \
 396         R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4);             \
 397         I512(2 * (R));                                                  \
 398         R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5);             \
 399         R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6);             \
 400         R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7);             \
 401         R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8);             \
 402         I512(2*(R) + 1);                /* and key injection */
 403
 404                         R512_8_rounds(0);
 405
     /*
      * Non-zero when 8-round group NN has to be emitted here: either the
      * code is fully unrolled and NN is below the total number of groups,
      * or the loop body is configured to hold more than NN groups.
      */
 406 #define R512_Unroll_R(NN) \
 407         ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL / 8 > (NN)) || \
 408         (SKEIN_UNROLL_512 > (NN)))
 409
 410 #if     R512_Unroll_R(1)
 411                         R512_8_rounds(1);
 412 #endif
 413 #if     R512_Unroll_R(2)
 414                         R512_8_rounds(2);
 415 #endif
 416 #if     R512_Unroll_R(3)
 417                         R512_8_rounds(3);
 418 #endif
 419 #if     R512_Unroll_R(4)
 420                         R512_8_rounds(4);
 421 #endif
 422 #if     R512_Unroll_R(5)
 423                         R512_8_rounds(5);
 424 #endif
 425 #if     R512_Unroll_R(6)
 426                         R512_8_rounds(6);
 427 #endif
 428 #if     R512_Unroll_R(7)
 429                         R512_8_rounds(7);
 430 #endif
 431 #if     R512_Unroll_R(8)
 432                         R512_8_rounds(8);
 433 #endif
 434 #if     R512_Unroll_R(9)
 435                         R512_8_rounds(9);
 436 #endif
 437 #if     R512_Unroll_R(10)
 438                         R512_8_rounds(10);
 439 #endif
 440 #if     R512_Unroll_R(11)
 441                         R512_8_rounds(11);
 442 #endif
 443 #if     R512_Unroll_R(12)
 444                         R512_8_rounds(12);
 445 #endif
 446 #if     R512_Unroll_R(13)
 447                         R512_8_rounds(13);
 448 #endif
 449 #if     R512_Unroll_R(14)
 450                         R512_8_rounds(14);
 451 #endif
 452 #if     (SKEIN_UNROLL_512 > 14)
 453 #error "need more unrolling in Skein_512_Process_Block"
 454 #endif
 455                 }
 456
 457                 /*
 458                  * do the final "feedforward" xor, update context chaining vars
 459                  */
 460                 ctx->X[0] = X0 ^ w[0];
 461                 ctx->X[1] = X1 ^ w[1];
 462                 ctx->X[2] = X2 ^ w[2];
 463                 ctx->X[3] = X3 ^ w[3];
 464                 ctx->X[4] = X4 ^ w[4];
 465                 ctx->X[5] = X5 ^ w[5];
 466                 ctx->X[6] = X6 ^ w[6];
 467                 ctx->X[7] = X7 ^ w[7];
 468                 Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
 469
                     /* blocks after this one must see the FIRST flag unset */
 470                 ts[1] &= ~SKEIN_T1_FLAG_FIRST;
 471         } while (--blkCnt);
             /* hand the updated tweak back to the caller's context */
 472         ctx->h.T[0] = ts[0];
 473         ctx->h.T[1] = ts[1];
 474 }
 475
 476 #if     defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
     /*
      * Measurement helper, built only when SKEIN_CODE_SIZE or SKEIN_PERF is
      * defined.  Returns the distance in bytes from the address of
      * Skein_512_Process_Block to the address of this function.
      *
      * NOTE(review): this equals the block function's code size only if the
      * toolchain places this function immediately after it, which C does
      * not guarantee -- treat the result as an estimate.
      */
 477 size_t
 478 Skein_512_Process_Block_CodeSize(void)
 479 {
 480         return ((uint8_t *)Skein_512_Process_Block_CodeSize) -
 481             ((uint8_t *)Skein_512_Process_Block);
 482 }
 483
     /*
      * Returns SKEIN_UNROLL_512, the unroll setting this file was compiled
      * with (0 means the 512-bit block function is fully unrolled).
      */
 484 uint_t
 485 Skein_512_Unroll_Cnt(void)
 486 {
 487         return (SKEIN_UNROLL_512);
 488 }
 489 #endif
 490 #endif
 491
 492 /*
      * Skein1024 block function.
      *
      * Runs the block transform over blkCnt consecutive input blocks of
      * SKEIN1024_BLOCK_BYTES bytes each, starting at blkPtr, and folds them
      * into the chaining state held in ctx.
      *
      *   ctx        - supplies the chaining words ctx->X[0..15] and the tweak
      *                words ctx->h.T[0..1]; both are overwritten with the
      *                updated values.
      *   blkPtr     - input data; advanced by one block at the end of each
      *                iteration.
      *   blkCnt     - number of blocks to process.  Must be non-zero: the
      *                do/while loop always runs once and only tests
      *                --blkCnt afterwards, so a zero count is not handled.
      *   byteCntAdd - amount added to tweak word T[0] (the processed-length
      *                counter) once per block.
      *
      * SKEIN_T1_FLAG_FIRST is cleared from T[1] after every block.
      *
      * NOTE(review): the Skein_Show_*() hooks reference Xptr, which is only
      * declared under SKEIN_DEBUG, so they presumably expand to nothing in
      * non-debug builds -- confirm in skein_impl.h.
      */
 493 #if     !(SKEIN_USE_ASM & 1024)
 494 void
 495 Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr,
 496     size_t blkCnt, size_t byteCntAdd)
 497 {
 498         /* do it in C, always looping (unrolled is bigger AND slower!) */
 499         enum {
 500                 WCNT = SKEIN1024_STATE_WORDS
 501         };
 502 #undef  RCNT
 503 #define RCNT  (SKEIN1024_ROUNDS_TOTAL/8)
 504
 505 #ifdef  SKEIN_LOOP              /* configure how much to unroll the loop */
 506 #define SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10)
 507 #else
 508 #define SKEIN_UNROLL_1024 (0)
 509 #endif
 510
 511 #if     (SKEIN_UNROLL_1024 != 0)
 512 #if     (RCNT % SKEIN_UNROLL_1024)
 513 #error "Invalid SKEIN_UNROLL_1024"      /* sanity check on unroll count */
 514 #endif
 515         size_t r;
 516         /* key schedule words : chaining vars + tweak + "rotation" */
 517         uint64_t kw[WCNT + 4 + RCNT * 2];
 518 #else
 519         uint64_t kw[WCNT + 4];  /* key schedule words : chaining vars + tweak */
 520 #endif
 521
 522         /* local copy of vars, for speed */
 523         uint64_t X00, X01, X02, X03, X04, X05, X06, X07, X08, X09, X10, X11,
 524             X12, X13, X14, X15;
 525         uint64_t w[WCNT];               /* local copy of input block */
 526 #ifdef  SKEIN_DEBUG
 527         /* use for debugging (help compiler put Xn in registers) */
 528         const uint64_t *Xptr[16];
 529         Xptr[0] = &X00;
 530         Xptr[1] = &X01;
 531         Xptr[2] = &X02;
 532         Xptr[3] = &X03;
 533         Xptr[4] = &X04;
 534         Xptr[5] = &X05;
 535         Xptr[6] = &X06;
 536         Xptr[7] = &X07;
 537         Xptr[8] = &X08;
 538         Xptr[9] = &X09;
 539         Xptr[10] = &X10;
 540         Xptr[11] = &X11;
 541         Xptr[12] = &X12;
 542         Xptr[13] = &X13;
 543         Xptr[14] = &X14;
 544         Xptr[15] = &X15;
 545 #endif
 546
 547         Skein_assert(blkCnt != 0);      /* never call with blkCnt == 0! */
             /* ts[] and ks[] are both windows into kw[] (see the ts/ks macros) */
 548         ts[0] = ctx->h.T[0];
 549         ts[1] = ctx->h.T[1];
 550         do {
 551                 /*
 552                  * this implementation only supports 2**64 input bytes
 553                  * (no carry out here)
 554                  */
 555                 ts[0] += byteCntAdd;    /* update processed length */
 556
 557                 /* precompute the key schedule for this block */
 558                 ks[0] = ctx->X[0];
 559                 ks[1] = ctx->X[1];
 560                 ks[2] = ctx->X[2];
 561                 ks[3] = ctx->X[3];
 562                 ks[4] = ctx->X[4];
 563                 ks[5] = ctx->X[5];
 564                 ks[6] = ctx->X[6];
 565                 ks[7] = ctx->X[7];
 566                 ks[8] = ctx->X[8];
 567                 ks[9] = ctx->X[9];
 568                 ks[10] = ctx->X[10];
 569                 ks[11] = ctx->X[11];
 570                 ks[12] = ctx->X[12];
 571                 ks[13] = ctx->X[13];
 572                 ks[14] = ctx->X[14];
 573                 ks[15] = ctx->X[15];
 574                 ks[16] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
 575                     ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^
 576                     ks[8] ^ ks[9] ^ ks[10] ^ ks[11] ^
 577                     ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY;
 578
 579                 ts[2] = ts[0] ^ ts[1];
 580
 581                 /* get input block in little-endian format */
 582                 Skein_Get64_LSB_First(w, blkPtr, WCNT);
 583                 DebugSaveTweak(ctx);
 584                 Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
 585
 586                 X00 = w[0] + ks[0];     /* do the first full key injection */
 587                 X01 = w[1] + ks[1];
 588                 X02 = w[2] + ks[2];
 589                 X03 = w[3] + ks[3];
 590                 X04 = w[4] + ks[4];
 591                 X05 = w[5] + ks[5];
 592                 X06 = w[6] + ks[6];
 593                 X07 = w[7] + ks[7];
 594                 X08 = w[8] + ks[8];
 595                 X09 = w[9] + ks[9];
 596                 X10 = w[10] + ks[10];
 597                 X11 = w[11] + ks[11];
 598                 X12 = w[12] + ks[12];
 599                 X13 = w[13] + ks[13] + ts[0];
 600                 X14 = w[14] + ks[14] + ts[1];
 601                 X15 = w[15] + ks[15];
 602
 603                 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
 604                     Xptr);
 605
     /*
      * One round: an add / rotate-left / xor step on each of the eight word
      * pairs (p0,p1) .. (pE,pF).  rNum is accepted but not used by Round1024
      * itself.
      */
 606 #define Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC,   \
 607         pD, pE, pF, ROT, rNum)                                          \
 608         X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\
 609         X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\
 610         X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\
 611         X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;\
 612         X##p8 += X##p9; X##p9 = RotL_64(X##p9, ROT##_4); X##p9 ^= X##p8;\
 613         X##pA += X##pB; X##pB = RotL_64(X##pB, ROT##_5); X##pB ^= X##pA;\
 614         X##pC += X##pD; X##pD = RotL_64(X##pD, ROT##_6); X##pD ^= X##pC;\
 615         X##pE += X##pF; X##pF = RotL_64(X##pF, ROT##_7); X##pF ^= X##pE;
 616
 617 #if     SKEIN_UNROLL_1024 == 0
 618 #define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD,   \
 619         pE, pF, ROT, rn)                                                \
 620         Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC,   \
 621         pD, pE, pF, ROT, rn)                                            \
 622         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rn, Xptr);
 623
     /* unrolled key injection: R is a constant, so ks/ts are indexed modulo 17/3 */
 624 #define I1024(R)                                                        \
 625         X00 += ks[((R) + 1) % 17];      /* inject the key schedule value */\
 626         X01 += ks[((R) + 2) % 17];                                      \
 627         X02 += ks[((R) + 3) % 17];                                      \
 628         X03 += ks[((R) + 4) % 17];                                      \
 629         X04 += ks[((R) + 5) % 17];                                      \
 630         X05 += ks[((R) + 6) % 17];                                      \
 631         X06 += ks[((R) + 7) % 17];                                      \
 632         X07 += ks[((R) + 8) % 17];                                      \
 633         X08 += ks[((R) + 9) % 17];                                      \
 634         X09 += ks[((R) + 10) % 17];                                     \
 635         X10 += ks[((R) + 11) % 17];                                     \
 636         X11 += ks[((R) + 12) % 17];                                     \
 637         X12 += ks[((R) + 13) % 17];                                     \
 638         X13 += ks[((R) + 14) % 17] + ts[((R) + 1) % 3];                 \
 639         X14 += ks[((R) + 15) % 17] + ts[((R) + 2) % 3];                 \
 640         X15 += ks[((R) + 16) % 17] + (R) +1;                            \
 641         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
 642 #else                           /* looping version */
 643 #define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD,   \
 644         pE, pF, ROT, rn)                                                \
 645         Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC,   \
 646         pD, pE, pF, ROT, rn)                                            \
 647         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rn, Xptr);
 648
     /*
      * looping key injection: ks/ts are indexed by the loop counter r, and
      * each injection appends a copy of the oldest word one slot past the
      * current window instead of using the modulo indexing of the unrolled
      * variant.  This is why kw[] is RCNT * 2 words larger in this build.
      */
 649 #define I1024(R)                                                        \
 650         X00 += ks[r + (R) + 0]; /* inject the key schedule value */     \
 651         X01 += ks[r + (R) + 1];                                         \
 652         X02 += ks[r + (R) + 2];                                         \
 653         X03 += ks[r + (R) + 3];                                         \
 654         X04 += ks[r + (R) + 4];                                         \
 655         X05 += ks[r + (R) + 5];                                         \
 656         X06 += ks[r + (R) + 6];                                         \
 657         X07 += ks[r + (R) + 7];                                         \
 658         X08 += ks[r + (R) + 8];                                         \
 659         X09 += ks[r + (R) + 9];                                         \
 660         X10 += ks[r + (R) + 10];                                        \
 661         X11 += ks[r + (R) + 11];                                        \
 662         X12 += ks[r + (R) + 12];                                        \
 663         X13 += ks[r + (R) + 13] + ts[r + (R) + 0];                      \
 664         X14 += ks[r + (R) + 14] + ts[r + (R) + 1];                      \
 665         X15 += ks[r + (R) + 15] +  r + (R);                             \
 666         ks[r + (R) + 16] = ks[r + (R) - 1];     /* rotate key schedule */\
 667         ts[r + (R) + 2] = ts[r + (R) - 1];                              \
 668         Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
 669
 670                 /* loop thru it */
                     /* r is always odd, so "<=" here stops at the same r as "<" */
 671                 for (r = 1; r <= 2 * RCNT; r += 2 * SKEIN_UNROLL_1024)
 672 #endif
 673                 {
 674 #define R1024_8_rounds(R)       /* do 8 full rounds */                  \
 675         R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13,   \
 676             14, 15, R1024_0, 8 * (R) + 1);                              \
 677         R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05,   \
 678             08, 01, R1024_1, 8 * (R) + 2);                              \
 679         R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11,   \
 680             10, 09, R1024_2, 8 * (R) + 3);                              \
 681         R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03,   \
 682             12, 07, R1024_3, 8 * (R) + 4);                              \
 683         I1024(2 * (R));                                                 \
 684         R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13,   \
 685             14, 15, R1024_4, 8 * (R) + 5);                              \
 686         R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05,   \
 687             08, 01, R1024_5, 8 * (R) + 6);                              \
 688         R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11,   \
 689             10, 09, R1024_6, 8 * (R) + 7);                              \
 690         R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03,   \
 691             12, 07, R1024_7, 8 * (R) + 8);                              \
 692         I1024(2 * (R) + 1);
 693
 694                         R1024_8_rounds(0);
 695
     /*
      * Non-zero when 8-round group NN has to be emitted here: either the
      * code is fully unrolled and NN is below the total number of groups,
      * or the loop body is configured to hold more than NN groups.
      */
 696 #define R1024_Unroll_R(NN)                                              \
 697         ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) || \
 698         (SKEIN_UNROLL_1024 > (NN)))
 699
 700 #if     R1024_Unroll_R(1)
 701                         R1024_8_rounds(1);
 702 #endif
 703 #if     R1024_Unroll_R(2)
 704                         R1024_8_rounds(2);
 705 #endif
 706 #if     R1024_Unroll_R(3)
 707                         R1024_8_rounds(3);
 708 #endif
 709 #if     R1024_Unroll_R(4)
 710                         R1024_8_rounds(4);
 711 #endif
 712 #if     R1024_Unroll_R(5)
 713                         R1024_8_rounds(5);
 714 #endif
 715 #if     R1024_Unroll_R(6)
 716                         R1024_8_rounds(6);
 717 #endif
 718 #if     R1024_Unroll_R(7)
 719                         R1024_8_rounds(7);
 720 #endif
 721 #if     R1024_Unroll_R(8)
 722                         R1024_8_rounds(8);
 723 #endif
 724 #if     R1024_Unroll_R(9)
 725                         R1024_8_rounds(9);
 726 #endif
 727 #if     R1024_Unroll_R(10)
 728                         R1024_8_rounds(10);
 729 #endif
 730 #if     R1024_Unroll_R(11)
 731                         R1024_8_rounds(11);
 732 #endif
 733 #if     R1024_Unroll_R(12)
 734                         R1024_8_rounds(12);
 735 #endif
 736 #if     R1024_Unroll_R(13)
 737                         R1024_8_rounds(13);
 738 #endif
 739 #if     R1024_Unroll_R(14)
 740                         R1024_8_rounds(14);
 741 #endif
 742 #if     (SKEIN_UNROLL_1024 > 14)
 743 #error  "need more unrolling in Skein_1024_Process_Block"
 744 #endif
 745                 }
 746                 /*
 747                  * do the final "feedforward" xor, update context chaining vars
 748                  */
 749
 750                 ctx->X[0] = X00 ^ w[0];
 751                 ctx->X[1] = X01 ^ w[1];
 752                 ctx->X[2] = X02 ^ w[2];
 753                 ctx->X[3] = X03 ^ w[3];
 754                 ctx->X[4] = X04 ^ w[4];
 755                 ctx->X[5] = X05 ^ w[5];
 756                 ctx->X[6] = X06 ^ w[6];
 757                 ctx->X[7] = X07 ^ w[7];
 758                 ctx->X[8] = X08 ^ w[8];
 759                 ctx->X[9] = X09 ^ w[9];
 760                 ctx->X[10] = X10 ^ w[10];
 761                 ctx->X[11] = X11 ^ w[11];
 762                 ctx->X[12] = X12 ^ w[12];
 763                 ctx->X[13] = X13 ^ w[13];
 764                 ctx->X[14] = X14 ^ w[14];
 765                 ctx->X[15] = X15 ^ w[15];
 766
 767                 Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
 768
                     /* blocks after this one must see the FIRST flag unset */
 769                 ts[1] &= ~SKEIN_T1_FLAG_FIRST;
 770                 blkPtr += SKEIN1024_BLOCK_BYTES;
 771         } while (--blkCnt);
             /* hand the updated tweak back to the caller's context */
 772         ctx->h.T[0] = ts[0];
 773         ctx->h.T[1] = ts[1];
 774 }
 775
 776 #if     defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
     /*
      * Measurement helper, built only when SKEIN_CODE_SIZE or SKEIN_PERF is
      * defined.  Returns the distance in bytes from the address of
      * Skein1024_Process_Block to the address of this function.
      *
      * NOTE(review): this equals the block function's code size only if the
      * toolchain places this function immediately after it, which C does
      * not guarantee -- treat the result as an estimate.
      */
 777 size_t
 778 Skein1024_Process_Block_CodeSize(void)
 779 {
 780         return ((uint8_t *)Skein1024_Process_Block_CodeSize) -
 781             ((uint8_t *)Skein1024_Process_Block);
 782 }
 783
     /*
      * Returns SKEIN_UNROLL_1024, the unroll setting this file was compiled
      * with (0 means the 1024-bit block function is fully unrolled).
      */
 784 uint_t
 785 Skein1024_Unroll_Cnt(void)
 786 {
 787         return (SKEIN_UNROLL_1024);
 788 }
 789 #endif
 790 #endif