arch/x86/crypto/sha1_avx2_x86_64_asm.S

   1 /*
   2  *      Implement fast SHA-1 with AVX2 instructions. (x86_64)
   3  *
   4  * This file is provided under a dual BSD/GPLv2 license.  When using or
   5  * redistributing this file, you may do so under either license.
   6  *
   7  * GPL LICENSE SUMMARY
   8  *
   9  * Copyright(c) 2014 Intel Corporation.
  10  *
  11  * This program is free software; you can redistribute it and/or modify
  12  * it under the terms of version 2 of the GNU General Public License as
  13  * published by the Free Software Foundation.
  14  *
  15  * This program is distributed in the hope that it will be useful, but
  16  * WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * General Public License for more details.
  19  *
  20  * Contact Information:
  21  * Ilya Albrekht <ilya.albrekht@intel.com>
  22  * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
  23  * Ronen Zohar <ronen.zohar@intel.com>
  24  * Chandramouli Narayanan <mouli@linux.intel.com>
  25  *
  26  * BSD LICENSE
  27  *
  28  * Copyright(c) 2014 Intel Corporation.
  29  *
  30  * Redistribution and use in source and binary forms, with or without
  31  * modification, are permitted provided that the following conditions
  32  * are met:
  33  *
  34  * Redistributions of source code must retain the above copyright
  35  * notice, this list of conditions and the following disclaimer.
  36  * Redistributions in binary form must reproduce the above copyright
  37  * notice, this list of conditions and the following disclaimer in
  38  * the documentation and/or other materials provided with the
  39  * distribution.
  40  * Neither the name of Intel Corporation nor the names of its
  41  * contributors may be used to endorse or promote products derived
  42  * from this software without specific prior written permission.
  43  *
  44  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  45  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  46  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  47  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  48  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  49  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  50  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  51  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  52  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  53  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  54  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  55  *
  56  */
  57
  58 /*
  59  * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
  60  *
  61  * This implementation is based on the previous SSSE3 release; see
  62  * http://software.intel.com/en-us/articles/
  63  * improving-the-performance-of-the-secure-hash-algorithm-1/
  64  *
  65  * Updates the 20-byte SHA-1 record in 'hash' for an even number of
  66  * 'num_blocks' consecutive 64-byte blocks.
  67  *
  68  * extern "C" void sha1_transform_avx2(
  69  *      int *hash, const char *input, size_t num_blocks);
  70  */
  71
  72 #include <linux/linkage.h>
  73
  74 #define CTX     %rdi    /* arg1: pointer to the hash state, copied to HASH_PTR */
  75 #define BUF     %rsi    /* arg2: pointer to the input data, copied to BUFFER_PTR */
  76 #define CNT     %rdx    /* arg3: block count, scaled by 64 in SHA1_VECTOR_ASM */
  77
     /*
      * Physical registers for the SHA-1 working variables and temporaries.
      * REG_B, REG_C and REG_E are the 32-bit halves of the argument registers
      * (%rsi, %rdi, %rdx), so SHA1_VECTOR_ASM copies the arguments into
      * HASH_PTR, BUFFER_PTR, BUFFER_PTR2 and BUFFER_END before the main body
      * loads the state into them.
      */
  78 #define REG_A   %ecx
  79 #define REG_B   %esi
  80 #define REG_C   %edi
  81 #define REG_D   %eax
  82 #define REG_E   %edx
  83 #define REG_TB  %ebx
  84 #define REG_TA  %r12d
     /* 64-bit names of the registers above, used as lea operands in the rounds */
  85 #define REG_RA  %rcx
  86 #define REG_RB  %rsi
  87 #define REG_RC  %rdi
  88 #define REG_RD  %rax
  89 #define REG_RE  %rdx
  90 #define REG_RTA %r12
  91 #define REG_RTB %rbx
  92 #define REG_T1  %ebp    /* scratch register of the round macros */
  93 #define xmm_mov vmovups /* unaligned vector move, used to load the shuffle mask */
  94 #define avx2_zeroupper  vzeroupper
     /* Round function selectors, stored in ROUND_FUNC and dispatched by RND_FUN */
  95 #define RND_F1  1
  96 #define RND_F2  2
  97 #define RND_F3  3
  98
  99 .macro REGALLOC
             /*
              * Bind the assembler symbols used by the round macros (A..E, TA, TB,
              * T1 and the 64-bit RA..RE, RTA, RTB) to their initial registers.
              * ROTATE_STATE permutes these bindings at assembly time after every
              * round; the main body expands REGALLOC again at the end of a block
              * pair to return to this initial mapping.
              * Emits no instructions.
              */
 100         .set A, REG_A
 101         .set B, REG_B
 102         .set C, REG_C
 103         .set D, REG_D
 104         .set E, REG_E
 105         .set TB, REG_TB
 106         .set TA, REG_TA
 107
 108         .set RA, REG_RA
 109         .set RB, REG_RB
 110         .set RC, REG_RC
 111         .set RD, REG_RD
 112         .set RE, REG_RE
 113
 114         .set RTA, REG_RTA
 115         .set RTB, REG_RTB
 116
 117         .set T1, REG_T1
 118 .endm
 119
 120 #define K_BASE          %r8     /* address of K_XMM_AR, also the end marker for BUFFER_PTR */
 121 #define HASH_PTR        %r9     /* copy of CTX: the five state words */
 122 #define BUFFER_PTR      %r10    /* input of the first block of a pair (low 128-bit lane) */
 123 #define BUFFER_PTR2     %r13    /* input of the second block of a pair (high 128-bit lane) */
 124 #define BUFFER_END      %r11    /* BUF + 64*CNT + 64, see SHA1_VECTOR_ASM */
 125
     /* The two K+w buffers, their roles are exchanged with xchg after each pair */
 126 #define PRECALC_BUF     %r14    /* buffer being filled by the PRECALC macros */
 127 #define WK_BUF          %r15    /* buffer being consumed by the round macros */
 128
 129 #define W_TMP           %xmm0   /* low half of WY_TMP */
 130 #define WY_TMP          %ymm0
 131 #define WY_TMP2         %ymm9
 132
 133 # AVX2 variables
     /* Eight vectors of message schedule words, renamed by PRECALC_ROTATE_WY */
 134 #define WY0             %ymm3
 135 #define WY4             %ymm5
 136 #define WY08            %ymm7
 137 #define WY12            %ymm8
 138 #define WY16            %ymm12
 139 #define WY20            %ymm13
 140 #define WY24            %ymm14
 141 #define WY28            %ymm15
 142
 143 #define YMM_SHUFB_BSWAP %ymm10  /* holds BSWAP_SHUFB_CTL, the vpshufb byte-swap mask */
 144
 145 /*
 146  * Keep 2 iterations precalculated at a time:
 147  *    - 80 DWORDs per iteration * 2
      *
      * W_SIZE is counted in DWORDs: two buffers of 80*2 DWORDs plus 16 spare.
 148  */
 149 #define W_SIZE          (80*2*2 +16)
 150
     /*
      * WK(t): memory operand of the precomputed K+w value for round (t % 80)
      * of block (t / 80) of the current pair. Each 32-byte row holds four
      * consecutive rounds: the low 16 bytes for the first block and the high
      * 16 bytes for the second block.
      *
      * PRECALC_WK(t): memory operand at byte offset 4*t in PRECALC_BUF. The
      * PRECALC macros store one 32-byte row every 8 steps, at t = (i & ~7).
      */
 151 #define WK(t)   ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
 152 #define PRECALC_WK(t)   ((t)*2*2)(PRECALC_BUF)
 153
 154
 155 .macro UPDATE_HASH  hash, val
             /*
              * Fold one working variable into the stored hash state.
              * hash: memory operand of the state word
              * val:  register holding the working variable
              * On exit both val and the memory word hold the sum.
              */
 156         add     \hash, \val             /* val += stored state word */
 157         mov     \val, \hash             /* store the sum back */
 158 .endm
 159
 160 .macro PRECALC_RESET_WY
             /*
              * Bind the rotating symbols WY_00..WY_28 to the fixed registers
              * WY0..WY28 and make WY_32 name the same register as WY_00.
              * Expanded by PRECALC_00_15 at step 0. Emits no instructions.
              */
 161         .set WY_00, WY0
 162         .set WY_04, WY4
 163         .set WY_08, WY08
 164         .set WY_12, WY12
 165         .set WY_16, WY16
 166         .set WY_20, WY20
 167         .set WY_24, WY24
 168         .set WY_28, WY28
 169         .set WY_32, WY_00
 170 .endm
 171
 172 .macro PRECALC_ROTATE_WY
             /*
              * Shift the WY_xx bindings by one vector: every WY_nn takes the
              * register of the previous WY_(nn-4), and WY_00 takes the register
              * that was WY_28, which becomes the destination of the next vector.
              * Emits no instructions.
              */
 173         /* Rotate macros */
 174         .set WY_32, WY_28
 175         .set WY_28, WY_24
 176         .set WY_24, WY_20
 177         .set WY_20, WY_16
 178         .set WY_16, WY_12
 179         .set WY_12, WY_08
 180         .set WY_08, WY_04
 181         .set WY_04, WY_00
 182         .set WY_00, WY_32
 183
 184         /* Define register aliases */
             /*
              * WY is the vector being produced, WY_minus_NN the vector NN words
              * behind it. WY_minus_32 is the same register as WY: until it is
              * overwritten it still holds the vector from 32 words back.
              */
 185         .set WY, WY_00
 186         .set WY_minus_04, WY_04
 187         .set WY_minus_08, WY_08
 188         .set WY_minus_12, WY_12
 189         .set WY_minus_16, WY_16
 190         .set WY_minus_20, WY_20
 191         .set WY_minus_24, WY_24
 192         .set WY_minus_28, WY_28
 193         .set WY_minus_32, WY
 194 .endm
 195
 196 .macro PRECALC_00_15
             /*
              * One step (selected by the symbol i, 0..31) of loading the message
              * words for rounds 0-15 of the next pair of blocks. Eight steps
              * build one vector: four words of the first block in the low lane
              * and the same four words of the second block in the high lane.
              * Reads PRECALC_OFFSET, K_XMM, BUFFER_PTR, BUFFER_PTR2 and K_BASE.
              * Steps 3, 5 and 6 emit nothing.
              */
 197         .if (i == 0) # Initialize and rotate registers
 198                 PRECALC_RESET_WY
 199                 PRECALC_ROTATE_WY
 200         .endif
 201
 202         /* message scheduling pre-compute for rounds 0-15 */
 203         .if   ((i & 7) == 0)
 204                 /*
 205                  * blended AVX2 and ALU instruction scheduling
 206                  * 1 vector iteration per 8 rounds
 207                  */
                     /* load 16 bytes of the first block at byte offset 2*i */
 208                 vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP
 209         .elseif ((i & 7) == 1)
                     /* same offset of the second block goes to the high lane */
 210                 vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\
 211                          WY_TMP, WY_TMP
 212         .elseif ((i & 7) == 2)
                     /* byte-swap every dword with the BSWAP_SHUFB_CTL mask */
 213                 vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
 214         .elseif ((i & 7) == 4)
                     /* WY_TMP = K + w, WY keeps the plain w for later steps */
 215                 vpaddd  K_XMM(K_BASE), WY, WY_TMP
 216         .elseif ((i & 7) == 7)
                     /* store the finished 32-byte row of K+w values */
 217                 vmovdqu  WY_TMP, PRECALC_WK(i&~7)
 218
 219                 PRECALC_ROTATE_WY
 220         .endif
 221 .endm
 222
 223 .macro PRECALC_16_31
 224         /*
 225          * message scheduling pre-compute for rounds 16-31
 226          * calculating last 32 w[i] values in 8 YMM registers
              * (low lane: first block of the pair, high lane: second block)
 227          * pre-calculate K+w[i] values and store to mem
 228          * for later load by ALU add instruction
 229          *
 230          * "brute force" vectorization for rounds 16-31 only
 231          * due to w[i]->w[i-3] dependency
              *
              * Four words are produced per vector, but the last of them needs
              * w[i-3], which is the first word of the same vector. The steps
              * below first compute x = w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16] with
              * that missing term taken as zero, rotate x left by 1, and then
              * xor dword 0 of x rotated left by 2 into dword 3.
              * Step 6 emits nothing.
 232          */
 233         .if   ((i & 7) == 0)
 234                 /*
 235                  * blended AVX2 and ALU instruction scheduling
 236                  * 1 vector iteration per 8 rounds
 237                  */
 238                 /* w[i-14] */
 239                 vpalignr        $8, WY_minus_16, WY_minus_12, WY
 240                 vpsrldq $4, WY_minus_04, WY_TMP               /* w[i-3] */
 241         .elseif ((i & 7) == 1)
 242                 vpxor   WY_minus_08, WY, WY             /* w[i-14] ^ w[i-8] */
 243                 vpxor   WY_minus_16, WY_TMP, WY_TMP     /* w[i-3] ^ w[i-16] */
 244         .elseif ((i & 7) == 2)
 245                 vpxor   WY_TMP, WY, WY                  /* WY = x */
 246                 vpslldq $12, WY, WY_TMP2        /* dword 0 of x moved to dword 3 */
 247         .elseif ((i & 7) == 3)
 248                 vpslld  $1, WY, WY_TMP          /* two halves of x rol 1 */
 249                 vpsrld  $31, WY, WY
 250         .elseif ((i & 7) == 4)
 251                 vpor    WY, WY_TMP, WY_TMP      /* WY_TMP = x rol 1 */
 252                 vpslld  $2, WY_TMP2, WY         /* two halves of WY_TMP2 rol 2 */
 253         .elseif ((i & 7) == 5)
 254                 vpsrld  $30, WY_TMP2, WY_TMP2
 255                 vpxor   WY, WY_TMP, WY_TMP
 256         .elseif ((i & 7) == 7)
 257                 vpxor   WY_TMP2, WY_TMP, WY     /* WY = final w values */
 258                 vpaddd  K_XMM(K_BASE), WY, WY_TMP       /* K + w */
 259                 vmovdqu WY_TMP, PRECALC_WK(i&~7)        /* store the 32-byte row */
 260
 261                 PRECALC_ROTATE_WY
 262         .endif
 263 .endm
 264
 265 .macro PRECALC_32_79
 266         /*
 267          * in SHA-1 specification:
 268          * w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 269          * instead we do equal:
 270          * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 271          * allows more efficient vectorization
 272          * since w[i]=>w[i-3] dependency is broken
              *
              * One step (selected by the symbol i) of that computation for the
              * next pair of blocks, eight steps per vector of four words.
              * Step 6 emits nothing.
 273          */
 274
 275         .if   ((i & 7) == 0)
 276         /*
 277          * blended AVX2 and ALU instruction scheduling
 278          * 1 vector iteration per 8 rounds
 279          */
                     /* WY_TMP = w[i-6]: upper half of w[i-8], lower half of w[i-4] */
 280                 vpalignr        $8, WY_minus_08, WY_minus_04, WY_TMP
 281         .elseif ((i & 7) == 1)
 282                 /* W is W_minus_32 before xor */
 283                 vpxor   WY_minus_28, WY, WY             /* w[i-32] ^ w[i-28] */
 284         .elseif ((i & 7) == 2)
 285                 vpxor   WY_minus_16, WY_TMP, WY_TMP     /* w[i-6] ^ w[i-16] */
 286         .elseif ((i & 7) == 3)
 287                 vpxor   WY_TMP, WY, WY                  /* all four terms */
 288         .elseif ((i & 7) == 4)
 289                 vpslld  $2, WY, WY_TMP                  /* two halves of rol 2 */
 290         .elseif ((i & 7) == 5)
 291                 vpsrld  $30, WY, WY
 292                 vpor    WY, WY_TMP, WY                  /* WY = final w values */
 293         .elseif ((i & 7) == 7)
 294                 vpaddd  K_XMM(K_BASE), WY, WY_TMP       /* K + w */
 295                 vmovdqu WY_TMP, PRECALC_WK(i&~7)        /* store the 32-byte row */
 296
 297                 PRECALC_ROTATE_WY
 298         .endif
 299 .endm
 300
 301 .macro PRECALC r, s
             /*
              * Emit one step of the message schedule for the next pair of blocks.
              * r: step index 0..159, copied into the symbol i. Eight steps make
              *    one vector holding four rounds for each of the two blocks, so
              *    a step covers half a round.
              * s: forwarded to the PRECALC_* macros, which declare no parameters.
              *    NOTE(review): the visible callers pass only r - confirm that
              *    s is unused.
              */
 302         .set i, \r
 303
             /*
              * Byte offset of the round constant inside K_XMM_AR. The constant
              * changes every 40 steps, i.e. every 20 rounds.
              */
 304         .if (i < 40)
 305                 .set K_XMM, 32*0
 306         .elseif (i < 80)
 307                 .set K_XMM, 32*1
 308         .elseif (i < 120)
 309                 .set K_XMM, 32*2
 310         .else
 311                 .set K_XMM, 32*3
 312         .endif
 313
             /* steps 0-31: rounds 0-15, steps 32-63: rounds 16-31, rest: 32-79 */
 314         .if (i<32)
 315                 PRECALC_00_15   \s
 316         .elseif (i<64)
 317                 PRECALC_16_31   \s
 318         .elseif (i < 160)
 319                 PRECALC_32_79   \s
 320         .endif
 321 .endm
 322
 323 .macro ROTATE_STATE
             /*
              * Rename the working variables after a round instead of moving
              * data: A takes the register of E, E of D, D of C, C of B, B of TB
              * and TB of A. The 64-bit names are rotated the same way.
              * T_REG is only a temporary symbol. Emits no instructions.
              */
 324         .set T_REG, E
 325         .set E, D
 326         .set D, C
 327         .set C, B
 328         .set B, TB
 329         .set TB, A
 330         .set A, T_REG
 331
 332         .set T_REG, RE
 333         .set RE, RD
 334         .set RD, RC
 335         .set RC, RB
 336         .set RB, RTB
 337         .set RTB, RA
 338         .set RA, T_REG
 339 .endm
 340
 341 /* Macro relies on saved ROUND_Fx */
 342
 343 .macro RND_FUN f, r
             /*
              * Expand the round macro selected by f.
              * f: RND_F1, RND_F2 or RND_F3 (the RR macro passes ROUND_FUNC);
              *    any other value expands to nothing
              * r: round index, passed through to the selected macro
              */
 344         .if (\f == RND_F1)
 345                 ROUND_F1        \r
 346         .elseif (\f == RND_F2)
 347                 ROUND_F2        \r
 348         .elseif (\f == RND_F3)
 349                 ROUND_F3        \r
 350         .endif
 351 .endm
 352
 353 .macro RR r
             /*
              * Emit two consecutive rounds, r and r+1.
              * r: even round index 0..158. r % 80 is the round within the block
              *    and r / 80 selects the block of the pair (see WK).
              * Every round macro computes the F value that the following round
              * adds, so round 0 of a block needs its F prepared here.
              */
 354         .set round_id, (\r % 80)
 355
 356         .if (round_id == 0)        /* Precalculate F for first round */
 357                 .set ROUND_FUNC, RND_F1
 358                 mov     B, TB
 359
 360                 rorx    $(32-30), B, B    /* b>>>2 */
 361                 andn    D, TB, T1         /* ~b & d */
 362                 and     C, TB             /* b & c */
 363                 xor     T1, TB            /* TB = F1(b, c, d) */
 364         .endif
 365
 366         RND_FUN ROUND_FUNC, \r
 367         ROTATE_STATE
 368
             /*
              * The selector changes between the two rounds of the pair: rounds
              * 19, 39 and 59 already use the macro that prepares F for rounds
              * 20, 40 and 60.
              */
 369         .if   (round_id == 18)
 370                 .set ROUND_FUNC, RND_F2
 371         .elseif (round_id == 38)
 372                 .set ROUND_FUNC, RND_F3
 373         .elseif (round_id == 58)
 374                 .set ROUND_FUNC, RND_F2
 375         .endif
 376
 377         .set round_id, ( (\r+1) % 80)
 378
 379         RND_FUN ROUND_FUNC, (\r+1)
 380         ROTATE_STATE
 381 .endm
 382
 383 .macro ROUND_F1 r
             /*
              * One SHA-1 round that also prepares F1 for the following round.
              * r: round index, selects the WK slot and the PRECALC step.
              * On entry TB holds F for this round. The b, c, d names in the
              * comments are the roles of A, B, C in the following round.
              * After the ROTATE_STATE that follows, the register written as A
              * here (the new F) is named TB and the register written as TB here
              * (A rotated right by 2) is named B.
              */
 384         add     WK(\r), E               /* E += precomputed K + w */
 385
 386         andn    C, A, T1                        /* ~b&d */
 387         lea     (RE,RTB), E             /* Add F from the previous round */
 388
 389         rorx    $(32-5), A, TA          /* TA = A rol 5 (rotate right by 27) */
 390         rorx    $(32-30),A, TB          /* b>>>2 for next round */
 391
 392         PRECALC (\r)                    /* msg scheduling for next 2 blocks */
 393
 394         /*
 395          * Calculate F for the next round
 396          * (b & c) ^ andn[b, d]
 397          */
 398         and     B, A                    /* b&c */
 399         xor     T1, A                   /* F1 = (b&c) ^ (~b&d) */
 400
 401         lea     (RE,RTA), E             /* E += A rol 5 */
 402 .endm
 403
 404 .macro ROUND_F2 r
             /*
              * One SHA-1 round that also prepares F2 = b ^ c ^ d for the
              * following round, where b, c, d are the roles of A, B, C in that
              * round.
              * r: round index, selects the WK slot and the PRECALC step.
              * Reads the symbol round_id set by RR: in round 79 there is no
              * following round, so the rotate of A and the F computation are
              * skipped and A is left unmodified.
              */
 405         add     WK(\r), E               /* E += precomputed K + w */
 406         lea     (RE,RTB), E             /* Add F from the previous round */
 407
 408         /* Calculate F for the next round */
 409         rorx    $(32-5), A, TA          /* TA = A rol 5 (rotate right by 27) */
 410         .if ((round_id) < 79)
 411                 rorx    $(32-30), A, TB /* b>>>2 for next round */
 412         .endif
 413         PRECALC (\r)                    /* msg scheduling for next 2 blocks */
 414
 415         .if ((round_id) < 79)
 416                 xor     B, A            /* b ^ c */
 417         .endif
 418
 419         add     TA, E                   /* E += A rol 5 */
 420
 421         .if ((round_id) < 79)
 422                 xor     C, A            /* F2 = b ^ c ^ d */
 423         .endif
 424 .endm
 425
 426 .macro ROUND_F3 r
             /*
              * One SHA-1 round that also prepares F3 for the following round.
              * r: round index, selects the WK slot and the PRECALC step.
              * The b, c, d names in the comments are the roles of A, B, C in
              * the following round.
              */
 427         add     WK(\r), E               /* E += precomputed K + w */
 428         PRECALC (\r)                    /* msg scheduling for next 2 blocks */
 429
 430         lea     (RE,RTB), E             /* Add F from the previous round */
 431
 432         mov     B, T1
 433         or      A, T1                   /* b | c */
 434
 435         rorx    $(32-5), A, TA          /* TA = A rol 5 (rotate right by 27) */
 436         rorx    $(32-30), A, TB         /* b>>>2 for next round */
 437
 438         /* Calculate F for the next round
 439          * (b and c) or (d and (b or c))
 440          */
 441         and     C, T1                   /* d & (b | c) */
 442         and     B, A                    /* b & c */
 443         or      T1, A                   /* F3 */
 444
 445         add     TA, E                   /* E += A rol 5 */
 446
 447 .endm
 448
 449 /*
 450  * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
      *
      * Blocks are processed in pairs. While the rounds of the current pair read
      * K+w values from WK_BUF, the PRECALC steps embedded in the round macros
      * fill PRECALC_BUF for the following pair. The two buffer pointers are
      * exchanged after every pair.
      *
      * Expects K_BASE, HASH_PTR, BUFFER_PTR, BUFFER_PTR2, BUFFER_END and
      * YMM_SHUFB_BSWAP to be set up by the caller (see SHA1_VECTOR_ASM) and
      * scratch space for both buffers starting at %rsp.
      *
      * The labels below are ordinary symbols, so the macro can be expanded
      * only once per file.
      *
      * NOTE(review): PRECALC_00_15 loads input at PRECALC_OFFSET (128) past
      * BUFFER_PTR and BUFFER_PTR2 during rounds 0-15 of the first block of
      * every pair, before the end-of-input checks below have run. This appears
      * to read input beyond the last block - confirm that callers tolerate it.
 451  */
 452 .macro SHA1_PIPELINED_MAIN_BODY
 453
 454         REGALLOC
 455
             /* load the five state words, this overwrites the argument registers */
 456         mov     (HASH_PTR), A
 457         mov     4(HASH_PTR), B
 458         mov     8(HASH_PTR), C
 459         mov     12(HASH_PTR), D
 460         mov     16(HASH_PTR), E
 461
             /* two buffers of 2*4*80 bytes each, 32 bytes apart */
 462         mov     %rsp, PRECALC_BUF
 463         lea     (2*4*80+32)(%rsp), WK_BUF
 464
 465         # Precalc WK for first 2 blocks
 466         PRECALC_OFFSET = 0
 467         .set i, 0
 468         .rept    160
 469                 PRECALC i
 470                 .set i, i + 1
 471         .endr
             /* from here on the precalc reads the pair after the current one */
 472         PRECALC_OFFSET = 128
             /* the buffer just filled becomes the one the rounds read */
 473         xchg    WK_BUF, PRECALC_BUF
 474
 475         .align 32
 476 _loop:
 477         /*
 478          * code loops through more than one block
 479          * we use K_BASE value as a signal of a last block,
 480          * it is set below by: cmovae K_BASE, BUFFER_PTR
 481          */
 482         cmp     K_BASE, BUFFER_PTR
 483         jne     _begin
 484         .align 32
 485         jmp     _end
 486         .align 32
 487 _begin:
 488
 489         /*
 490          * Do first block
 491          * rounds: 0,2,4,6,8
 492          */
 493         .set j, 0
 494         .rept 5
 495                 RR      j
 496                 .set j, j+2
 497         .endr
 498
             /*
              * Jump to the label that immediately follows.
              * NOTE(review): presumably only splits the long unrolled code
              * sequence - confirm the intent.
              */
 499         jmp _loop0
 500 _loop0:
 501
 502         /*
 503          * rounds:
 504          * 10,12,14,16,18
 505          * 20,22,24,26,28
 506          * 30,32,34,36,38
 507          * 40,42,44,46,48
 508          * 50,52,54,56,58
 509          */
 510         .rept 25
 511                 RR      j
 512                 .set j, j+2
 513         .endr
 514
 515         add     $(2*64), BUFFER_PTR       /* move to next odd-64-byte block */
 516         cmp     BUFFER_END, BUFFER_PTR    /* is current block the last one? */
 517         cmovae  K_BASE, BUFFER_PTR      /* signal the last iteration smartly */
 518
 519         /*
 520          * rounds
 521          * 60,62,64,66,68
 522          * 70,72,74,76,78
 523          */
 524         .rept 10
 525                 RR      j
 526                 .set j, j+2
 527         .endr
 528
             /*
              * Round 79 skips the rotate and the F computation (see ROUND_F2),
              * so after the final ROTATE_STATE the b value sits unrotated in TB.
              */
 529         UPDATE_HASH     (HASH_PTR), A
 530         UPDATE_HASH     4(HASH_PTR), TB
 531         UPDATE_HASH     8(HASH_PTR), C
 532         UPDATE_HASH     12(HASH_PTR), D
 533         UPDATE_HASH     16(HASH_PTR), E
 534
 535         cmp     K_BASE, BUFFER_PTR      /* is current block the last one? */
 536         je      _loop                   /* _loop repeats the test and exits */
 537
 538         mov     TB, B                   /* RR expects b in B at round 0 */
 539
 540         /* Process second block */
 541         /*
 542          * rounds
 543          *  0+80, 2+80, 4+80, 6+80, 8+80
 544          * 10+80,12+80,14+80,16+80,18+80
 545          */
 546
 547         .set j, 0
 548         .rept 10
 549                 RR      j+80
 550                 .set j, j+2
 551         .endr
 552
 553         jmp     _loop1
 554 _loop1:
 555         /*
 556          * rounds
 557          * 20+80,22+80,24+80,26+80,28+80
 558          * 30+80,32+80,34+80,36+80,38+80
 559          */
 560         .rept 10
 561                 RR      j+80
 562                 .set j, j+2
 563         .endr
 564
 565         jmp     _loop2
 566 _loop2:
 567
 568         /*
 569          * rounds
 570          * 40+80,42+80,44+80,46+80,48+80
 571          * 50+80,52+80,54+80,56+80,58+80
 572          */
 573         .rept 10
 574                 RR      j+80
 575                 .set j, j+2
 576         .endr
 577
 578         add     $(2*64), BUFFER_PTR2      /* move to next even-64-byte block */
 579
             /* the end marker is written to BUFFER_PTR, the register _loop tests */
 580         cmp     BUFFER_END, BUFFER_PTR2   /* is current block the last one */
 581         cmovae  K_BASE, BUFFER_PTR       /* signal the last iteration smartly */
 582
 583         jmp     _loop3
 584 _loop3:
 585
 586         /*
 587          * rounds
 588          * 60+80,62+80,64+80,66+80,68+80
 589          * 70+80,72+80,74+80,76+80,78+80
 590          */
 591         .rept 10
 592                 RR      j+80
 593                 .set j, j+2
 594         .endr
 595
 596         UPDATE_HASH     (HASH_PTR), A
 597         UPDATE_HASH     4(HASH_PTR), TB
 598         UPDATE_HASH     8(HASH_PTR), C
 599         UPDATE_HASH     12(HASH_PTR), D
 600         UPDATE_HASH     16(HASH_PTR), E
 601
 602         /* Reset state for AVX2 reg permutation */
             /*
              * After 160 rounds the symbols name permuted registers. Copy the
              * state into the registers of the initial mapping, which the
              * REGALLOC below restores. TA carries a while its register is
              * overwritten.
              */
 603         mov     A, TA
 604         mov     TB, A
 605         mov     C, TB
 606         mov     E, C
 607         mov     D, B
 608         mov     TA, D
 609
 610         REGALLOC
 611
             /* the buffer filled during this pair is read by the next pair */
 612         xchg    WK_BUF, PRECALC_BUF
 613
 614         jmp     _loop
 615
 616         .align 32
 617         _end:
 618
 619 .endm
 620 /*
 621  * macro implements SHA-1 function's body for several 64-byte blocks
 622  * param: function's name
      *
      * Arguments of the generated function: CTX (hash state), BUF (input),
      * CNT (number of 64-byte blocks).
      * Saves and restores %rbx, %rbp and %r12-%r15, and reserves
      * RESERVE_STACK bytes of scratch space below a realigned %rsp.
      *
      * NOTE(review): a CNT of zero does not appear to be handled. BUFFER_PTR
      * still differs from K_BASE at the first test in the main body, so one
      * block would be processed - confirm callers never pass 0.
 623  */
 624 .macro SHA1_VECTOR_ASM  name
 625         ENTRY(\name)
 626
 627         push    %rbx
 628         push    %rbp
 629         push    %r12
 630         push    %r13
 631         push    %r14
 632         push    %r15
 633
             /* W_SIZE dwords for the two K+w buffers plus 32 spare bytes */
 634         RESERVE_STACK  = (W_SIZE*4 + 8+24)
 635
 636         /* Align stack */
             /*
              * The original %rsp is kept on the new stack and restored by the
              * pop %rsp below. The push and the reservation leave %rsp at 24
              * modulo 32, so the buffers are not 32-byte aligned. They are
              * only accessed with vmovdqu and scalar instructions.
              */
 637         mov     %rsp, %rbx
 638         and     $~(0x20-1), %rsp
 639         push    %rbx
 640         sub     $RESERVE_STACK, %rsp
 641
 642         avx2_zeroupper
 643
 644         lea     K_XMM_AR(%rip), K_BASE
 645
             /* move the arguments out of the registers reused for B, C and E */
 646         mov     CTX, HASH_PTR
 647         mov     BUF, BUFFER_PTR
 648         lea     64(BUF), BUFFER_PTR2    /* second block of the first pair */
 649
             /* BUFFER_END = BUF + 64*CNT + 64 */
 650         shl     $6, CNT                 /* mul by 64 */
 651         add     BUF, CNT
 652         add     $64, CNT
 653         mov     CNT, BUFFER_END
 654
             /* point the second-block loads at K_BASE when BUFFER_PTR2 is past the end */
 655         cmp     BUFFER_END, BUFFER_PTR2
 656         cmovae  K_BASE, BUFFER_PTR2
 657
             /* load the 32-byte byte-swap mask used by PRECALC_00_15 */
 658         xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
 659
 660         SHA1_PIPELINED_MAIN_BODY
 661
 662         avx2_zeroupper
 663
 664         add     $RESERVE_STACK, %rsp
 665         pop     %rsp                    /* back to the stack pointer saved above */
 666
 667         pop     %r15
 668         pop     %r14
 669         pop     %r13
 670         pop     %r12
 671         pop     %rbp
 672         pop     %rbx
 673
 674         ret
 675
 676         ENDPROC(\name)
 677 .endm
 678
 679 .section .rodata
 680
     /* The four round constants that PRECALC adds to the message words */
 681 #define K1 0x5a827999
 682 #define K2 0x6ed9eba1
 683 #define K3 0x8f1bbcdc
 684 #define K4 0xca62c1d6
 685
     /*
      * Each constant is repeated eight times so that one 32-byte vpaddd
      * operand covers both lanes. PRECALC selects a row with K_XMM = 32*n.
      */
 686 .align 128
 687 K_XMM_AR:
 688         .long K1, K1, K1, K1
 689         .long K1, K1, K1, K1
 690         .long K2, K2, K2, K2
 691         .long K2, K2, K2, K2
 692         .long K3, K3, K3, K3
 693         .long K3, K3, K3, K3
 694         .long K4, K4, K4, K4
 695         .long K4, K4, K4, K4
 696
     /*
      * vpshufb control loaded into YMM_SHUFB_BSWAP: reverses the byte order of
      * every dword, with the same 16-byte pattern in both lanes.
      */
 697 BSWAP_SHUFB_CTL:
 698         .long 0x00010203
 699         .long 0x04050607
 700         .long 0x08090a0b
 701         .long 0x0c0d0e0f
 702         .long 0x00010203
 703         .long 0x04050607
 704         .long 0x08090a0b
 705         .long 0x0c0d0e0f
 706 .text
 707
     /* Instantiate the transform under its exported name */
 708 SHA1_VECTOR_ASM     sha1_transform_avx2