ceph/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 %include "reg_sizes.asm"
     ; NOTE(review): mk_global and endbranch, used at the sha512_sse4 entry point,
     ; are not defined in this file; presumably they come from reg_sizes.asm - verify.
  31
  32 [bits 64]                  ; 64-bit code
  33 default rel                ; register-less memory operands (K512, XMM_QWORD_BSWAP) are RIP-relative
  34 section .text
  35
  36 ; Virtual Registers
     ; Symbolic names for every general-purpose register used in this file.
     ; The three arguments arrive in different registers depending on the output
     ; format, so the scratch registers T1/T2 are chosen per format so that they
     ; do not collide with msg/digest/msglen.
  37 %ifidn __OUTPUT_FORMAT__, win64
  38         %define msg     rcx ; ARG1: input message pointer
  39         %define digest  rdx ; ARG2: digest (8 qwords) pointer
  40         %define msglen  r8  ; ARG3: number of blocks to process
  41         %define T1      rsi ; scratch (saved/restored by sha512_sse4 on win64)
  42         %define T2      rdi ; scratch (saved/restored by sha512_sse4 on win64)
  43 %else
  44         %define msg     rdi ; ARG1: input message pointer
  45         %define digest  rsi ; ARG2: digest (8 qwords) pointer
  46         %define msglen  rdx ; ARG3: number of blocks to process
  47         %define T1      rcx ; scratch
  48         %define T2      r8  ; scratch
  49 %endif
     ; Working variables a..h of the compression function. These are the INITIAL
     ; bindings only: RotateState re-binds the eight names after every round.
     ; rbx and r12-r15 are saved and restored by sha512_sse4.
  50 %define a_64    r9
  51 %define b_64    r10
  52 %define c_64    r11
  53 %define d_64    r12
  54 %define e_64    r13
  55 %define f_64    r14
  56 %define g_64    r15
  57 %define h_64    rbx
  58 %define tmp0    rax         ; scratch used by the round code for the rotate/xor chains
  59
  60 ; Local variables (stack frame)
  61 ; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP
     ; (RSP is 8 mod 16 at function entry because of the pushed return address,
     ; so subtracting an odd number of qwords makes it 16-byte aligned.)
     ; Size here: 80 + 2 + 5 = 87 qwords, or 80 + 2 + 7 = 89 qwords for win64.
     ; .W starts at offset 0 and .WK at offset 640, so both are 16-byte aligned
     ; relative to RSP, which the movdqa accesses to W_t(even t) and WK_2 rely on.
  62 struc frame
  63         .W:       resq 80 ; Message Schedule
  64         .WK:      resq  2 ; W[t] + K[t] | W[t+1] + K[t+1]
  65
  66 %ifidn __OUTPUT_FORMAT__, win64
  67         .GPRSAVE: resq 7  ; rbx, r12-r15, rsi, rdi
  68 %else
  69         .GPRSAVE: resq 5  ; rbx, r12-r15
  70 %endif
  71 endstruc
  72
  73 ; Useful QWORD "arrays" for simpler memory references
     ; Each macro expands to an address expression only; the user supplies the
     ; enclosing [ ] (and any size override) at the point of use.
  74 %define MSG(i)    msg    + 8*(i)               ; Input message (arg1)
  75 %define DIGEST(i) digest + 8*(i)               ; Output Digest (arg2)
  76 %define K_t(i)    K512   + 8*(i)               ; SHA Constants (static mem)
  77 %define W_t(i)    rsp + frame.W  + 8*(i)       ; Message Schedule (stack frame)
  78 %define WK_2(i)   rsp + frame.WK + 8*((i) % 2) ; W[t]+K[t] (stack frame)
  79 ; MSG, DIGEST, K_t, W_t are arrays
  80 ; WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
     ; (so WK_2(t) and WK_2(t-2) address the same slot)
  81
  82 %macro RotateState 0
  83         ; Rotate symbols a..h right
             ; Assemble-time renaming only: no instructions are emitted.
             ; %xdefine expands its right-hand side immediately, so each name
             ; takes over the register currently bound to its predecessor:
             ; the register the round wrote the new "a" into (via h_64) becomes
             ; a_64, old a becomes b, ..., and the updated d_64 becomes e_64.
             ; Eight invocations restore the original bindings.
  84         %xdefine %%TMP h_64
  85         %xdefine h_64  g_64
  86         %xdefine g_64  f_64
  87         %xdefine f_64  e_64
  88         %xdefine e_64  d_64
  89         %xdefine d_64  c_64
  90         %xdefine c_64  b_64
  91         %xdefine b_64  a_64
  92         %xdefine a_64  %%TMP
  93 %endmacro
  94
  95 %macro SHA512_Round 1
     ; One SHA-512 round using only general-purpose registers.
     ;   %1       : round index t; selects which WK_2 slot holds W[t]+K[t]
     ;   Reads    : a_64..h_64, [WK_2(t)]
     ;   Writes   : d_64 (becomes e after rotation), h_64 (becomes a after rotation)
     ;   Clobbers : T1, T2, tmp0, flags
     ; S1(e) and S0(a) are each built as a rotate/xor chain. On the ror lines
     ; the first number after ';' is the cumulative rotation the term introduced
     ; at that step ends up with (S1 = ror 41 ^ ror 18 ^ ror 14, S0 = ror 39 ^
     ; ror 34 ^ ror 28).
  96 %assign %%t   (%1)
  97
  98         ; Compute Round %%t
  99         mov     T1,   f_64        ; T1 = f
 100         mov     tmp0, e_64        ; tmp = e
 101         xor     T1,   g_64        ; T1 = f ^ g
 102         ror     tmp0, 23 ; 41     ; tmp = e ror 23
 103         and     T1,   e_64        ; T1 = (f ^ g) & e
 104         xor     tmp0, e_64        ; tmp = (e ror 23) ^ e
 105         xor     T1,   g_64        ; T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
 106         add     T1,   [WK_2(%%t)] ; W[t] + K[t] from message scheduler
 107         ror     tmp0, 4 ; 18      ; tmp = ((e ror 23) ^ e) ror 4
 108         xor     tmp0, e_64        ; tmp = (((e ror 23) ^ e) ror 4) ^ e
 109         mov     T2,   a_64        ; T2 = a
 110         add     T1,   h_64        ; T1 = CH(e,f,g) + W[t] + K[t] + h
 111         ror     tmp0, 14 ; 14     ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
 112         add     T1,   tmp0        ; T1 = CH(e,f,g) + W[t] + K[t] + h + S1(e)
 113         mov     tmp0, a_64        ; tmp = a
 114         xor     T2,   c_64        ; T2 = a ^ c
 115         and     tmp0, c_64        ; tmp = a & c
 116         and     T2,   b_64        ; T2 = (a ^ c) & b
 117         xor     T2,   tmp0        ; T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
 118         mov     tmp0, a_64        ; tmp = a
 119         ror     tmp0, 5 ; 39      ; tmp = a ror 5
 120         xor     tmp0, a_64        ; tmp = (a ror 5) ^ a
 121         add     d_64, T1          ; e(next_state) = d + T1
 122         ror     tmp0, 6 ; 34      ; tmp = ((a ror 5) ^ a) ror 6
 123         xor     tmp0, a_64        ; tmp = (((a ror 5) ^ a) ror 6) ^ a
 124         lea     h_64, [T1 + T2]   ; a(next_state) = T1 + Maj(a,b,c)
 125         ror     tmp0, 28 ; 28     ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
 126         add     h_64, tmp0        ; a(next_state) = T1 + Maj(a,b,c) + S0(a)
 127         RotateState               ; rename registers so a..h name the next state
 128 %endmacro
 129
 130 %macro SHA512_2Sched_2Round_sse 1
 131 %assign %%t (%1)
 132
 133         ; Compute rounds %%t-2 and %%t-1
 134         ; Compute message schedule QWORDS %%t and %%t+1
 135
 136         ;   Two rounds are computed based on the values for K[t-2]+W[t-2] and
 137         ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message
 138         ; scheduler.
 139         ;   The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
 140         ; They are then added to their respective SHA512 constants at
 141         ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
 142         ;   For brevity, the comments following vectored instructions only refer to
 143         ; the first of a pair of QWORDS.
 144         ; Eg. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
 145         ;   The computation of the message schedule and the rounds are tightly
 146         ; stitched to take advantage of instruction-level parallelism.
 147         ; For clarity, integer instructions (for the rounds calculation) are indented
 148         ; by one tab. Vectored instructions (for the message scheduler) are indented
 149         ; by two tabs.
             ;
             ;   %1       : schedule index t. It must be even (movdqa is used on
             ;              W_t(t-2), W_t(t-16), W_t(t) and WK_2(t)) and at least 16
             ;              (W[t-16] is read).
             ;   Clobbers : T1, T2, tmp0, xmm0-xmm5, flags
             ;   The round code addresses its input as WK_2(%%t) and WK_2(%%t+1);
             ; WK_2 indexes modulo 2, so these are the slots written for t-2 and t-1.
             ; They are overwritten only at the very end, after both rounds read them.
             ;   Every reference inside this macro uses the macro-local %%t, so the
             ; macro does not depend on a preprocessor variable of the caller.
 150
 151         mov     T1, f_64
 152         movdqa  xmm2, [W_t(%%t-2)]  ; XMM2 = W[t-2]
 153         xor     T1,   g_64
 154         and     T1,   e_64
 155         movdqa  xmm0, xmm2          ; XMM0 = W[t-2]
 156         xor     T1,   g_64
 157         add     T1,   [WK_2(%%t)]
 158         movdqu  xmm5, [W_t(%%t-15)] ; XMM5 = W[t-15]
 159         mov     tmp0, e_64
 160         ror     tmp0, 23 ; 41
 161         movdqa  xmm3, xmm5          ; XMM3 = W[t-15]
 162         xor     tmp0, e_64
 163         ror     tmp0, 4 ; 18
 164         psrlq   xmm0, 61 - 19       ; XMM0 = W[t-2] >> 42
 165         xor     tmp0, e_64
 166         ror     tmp0, 14 ; 14
 167         psrlq   xmm3, (8 - 7)       ; XMM3 = W[t-15] >> 1
 168         add     T1,   tmp0
 169         add     T1,   h_64
 170         pxor    xmm0, xmm2          ; XMM0 = (W[t-2] >> 42) ^ W[t-2]
 171         mov     T2,   a_64
 172         xor     T2,   c_64
 173         pxor    xmm3, xmm5          ; XMM3 = (W[t-15] >> 1) ^ W[t-15]
 174         and     T2,   b_64
 175         mov     tmp0, a_64
 176         psrlq   xmm0, 19 - 6        ; XMM0 = ((W[t-2]>>42)^W[t-2])>>13
 177         and     tmp0, c_64
 178         xor     T2,   tmp0
 179         psrlq   xmm3, (7 - 1)       ; XMM3 = ((W[t-15]>>1)^W[t-15])>>6
 180         mov     tmp0, a_64
 181         ror     tmp0, 5 ; 39
 182         pxor    xmm0, xmm2          ; XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
 183         xor     tmp0, a_64
 184         ror     tmp0, 6 ; 34
 185         pxor    xmm3, xmm5          ; XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
 186         xor     tmp0, a_64
 187         ror     tmp0, 28 ; 28
 188         psrlq   xmm0, 6             ; XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
 189         add     T2,   tmp0
 190         add     d_64, T1
 191         psrlq   xmm3, 1             ; XMM3 = ((((W[t-15]>>1)^W[t-15])>>6)^W[t-15])>>1
 192         lea     h_64, [T1 + T2]
 193         RotateState
 194         movdqa  xmm1, xmm2          ; XMM1 = W[t-2]
 195         mov     T1, f_64
 196         xor     T1,   g_64
 197         movdqa  xmm4, xmm5          ; XMM4 = W[t-15]
 198         and     T1,   e_64
 199         xor     T1,   g_64
 200         psllq   xmm1, (64 - 19) - (64 - 61) ; XMM1 = W[t-2] << 42
 201         add     T1,   [WK_2(%%t+1)]
 202         mov     tmp0, e_64
 203         psllq   xmm4, (64 - 1) - (64 - 8) ; XMM4 = W[t-15] << 7
 204         ror     tmp0, 23 ; 41
 205         xor     tmp0, e_64
 206         pxor    xmm1, xmm2          ; XMM1 = (W[t-2] << 42)^W[t-2]
 207         ror     tmp0, 4 ; 18
 208         xor     tmp0, e_64
 209         pxor    xmm4, xmm5          ; XMM4 = (W[t-15]<<7)^W[t-15]
 210         ror     tmp0, 14 ; 14
 211         add     T1,   tmp0
 212         psllq   xmm1, (64 - 61)     ; XMM1 = ((W[t-2] << 42)^W[t-2])<<3
 213         add     T1,   h_64
 214         mov     T2,   a_64
 215         psllq   xmm4, (64 - 8)      ; XMM4 = ((W[t-15]<<7)^W[t-15])<<56
 216         xor     T2,   c_64
 217         and     T2,   b_64
 218         pxor    xmm0, xmm1          ; XMM0 = s1(W[t-2])
 219         mov     tmp0, a_64
 220         and     tmp0, c_64
 221         movdqu  xmm1, [W_t(%%t- 7)] ; XMM1 = W[t-7]
 222         xor     T2,   tmp0
 223         pxor    xmm3, xmm4          ; XMM3 = s0(W[t-15])
 224         mov     tmp0, a_64
 225         paddq   xmm0, xmm3          ; XMM0 = s1(W[t-2]) + s0(W[t-15])
 226         ror     tmp0, 5 ; 39
 227         paddq   xmm0, [W_t(%%t-16)] ; XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
 228         xor     tmp0, a_64
 229         paddq   xmm0, xmm1          ; XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
 230         ror     tmp0, 6 ; 34
 231         movdqa  [W_t(%%t)], xmm0    ; Store scheduled qwords
 232         xor     tmp0, a_64
 233         paddq   xmm0, [K_t(%%t)]    ; Compute W[t]+K[t]
 234         ror     tmp0, 28 ; 28
 235         movdqa  [WK_2(%%t)], xmm0   ; Store W[t]+K[t] for next rounds
 236         add     T2,   tmp0
 237         add     d_64, T1
 238         lea     h_64, [T1 + T2]
 239         RotateState
 240 %endmacro
 241
 242 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 243 ; void sha512_sse4(const void* M, void* D, uint64_t L);
 244 ; Purpose: Updates the SHA512 digest stored at D with the message stored in M.
 245 ; The size of the message pointed to by M must be an integer multiple of SHA512
 246 ;   message blocks.
 247 ; L is the message length in SHA512 blocks.
     ;
     ; In:    msg    = M, read 128 bytes per block with unaligned loads (movdqu)
     ;        digest = D, 8 qwords, read at the start of each block and updated
     ;                 in place at the end of each block
     ;        msglen = L; L == 0 returns immediately without touching the stack or D
     ; Out:   nothing in registers; D holds the updated state
     ; Saved: rbx, r12-r15 (plus rsi, rdi for win64) are stored in the frame and
     ;        restored before returning
     ; Clobb: rax, r9-r11, T1, T2, xmm0-xmm5, flags; msg is advanced past the
     ;        processed blocks and msglen is counted down to 0
     ; Stack: frame_size bytes below the return address; RSP is 16-byte aligned
     ;        while the frame is live (see the note at struc frame)
     ; Uses pshufb, so the CPU must support SSSE3.
     ; NOTE(review): no unwind information is emitted in this file for the win64
     ; frame - confirm that this is acceptable for the intended callers.
 248 mk_global sha512_sse4, function
 249 sha512_sse4:
 250         endbranch
 251         test    msglen, msglen  ; zero blocks requested?
 252         jz      .nowork
 253
 254         ; Allocate Stack Space
 255         sub     rsp, frame_size
 256
 257         ; Save GPRs
 258         mov     [rsp + frame.GPRSAVE + 8 * 0], rbx
 259         mov     [rsp + frame.GPRSAVE + 8 * 1], r12
 260         mov     [rsp + frame.GPRSAVE + 8 * 2], r13
 261         mov     [rsp + frame.GPRSAVE + 8 * 3], r14
 262         mov     [rsp + frame.GPRSAVE + 8 * 4], r15
 263 %ifidn __OUTPUT_FORMAT__, win64
 264         mov     [rsp + frame.GPRSAVE + 8 * 5], rsi
 265         mov     [rsp + frame.GPRSAVE + 8 * 6], rdi
 266 %endif
 267
 268 .updateblock:
             ; Loop invariant: msglen = blocks still to process (> 0),
             ; msg = start of the current 128-byte block.
 269
 270         ; Load state variables
 271         mov     a_64, [DIGEST(0)]
 272         mov     b_64, [DIGEST(1)]
 273         mov     c_64, [DIGEST(2)]
 274         mov     d_64, [DIGEST(3)]
 275         mov     e_64, [DIGEST(4)]
 276         mov     f_64, [DIGEST(5)]
 277         mov     g_64, [DIGEST(6)]
 278         mov     h_64, [DIGEST(7)]
 279
             ; Fully unrolled at assembly time: t = 0, 2, ..., 80.
             ;   t = 0      : load and byte-swap W[0..1], no round yet
             ;   t = 2..14  : load and byte-swap W[t..t+1], rounds t-2 and t-1
             ;   t = 16..78 : schedule W[t..t+1], rounds t-2 and t-1
             ;   t = 80     : rounds 78 and 79 only
             ; 80 rounds is a multiple of 8, so after the loop a_64..h_64 are bound
             ; to the same registers as at "Load state variables".
 280         %assign t 0
 281         %rep 80/2 + 1
 282         ; (80 rounds) / (2 rounds/iteration) + (1 iteration)
 283         ; +1 iteration because the scheduler leads hashing by 1 iteration
 284                 %if t < 2
 285                         ; BSWAP 2 QWORDS
                             ; xmm1 keeps the shuffle mask for the whole t < 16 phase.
                             ; It is reloaded for every block because the scheduling
                             ; macro overwrites xmm1.
 286                         movdqa  xmm1, [XMM_QWORD_BSWAP]
 287                         movdqu  xmm0, [MSG(t)]
 288                         pshufb  xmm0, xmm1      ; BSWAP
 289                         movdqa  [W_t(t)], xmm0  ; Store Scheduled Pair
 290                         paddq   xmm0, [K_t(t)]  ; Compute W[t]+K[t]
 291                         movdqa  [WK_2(t)], xmm0 ; Store into WK for rounds
 292                 %elif t < 16
 293                         ; BSWAP 2 QWORDS; Compute 2 Rounds
 294                         movdqu  xmm0, [MSG(t)]
 295                         pshufb  xmm0, xmm1      ; BSWAP
 296                         SHA512_Round t - 2      ; Round t-2
 297                         movdqa  [W_t(t)], xmm0  ; Store Scheduled Pair
 298                         paddq   xmm0, [K_t(t)]  ; Compute W[t]+K[t]
 299                         SHA512_Round t - 1      ; Round t-1
                             ; WK is overwritten only after both rounds have read it
 300                         movdqa  [WK_2(t)], xmm0 ; Store W[t]+K[t] into WK
 301                 %elif t < 79
 302                         ; Schedule 2 QWORDS; Compute 2 Rounds
 303                         SHA512_2Sched_2Round_sse t
 304                 %else
 305                         ; Compute 2 Rounds
 306                         SHA512_Round t - 2
 307                         SHA512_Round t - 1
 308                 %endif
 309         %assign t t+2
 310         %endrep
 311
 312         ; Update digest
 313         add     [DIGEST(0)], a_64
 314         add     [DIGEST(1)], b_64
 315         add     [DIGEST(2)], c_64
 316         add     [DIGEST(3)], d_64
 317         add     [DIGEST(4)], e_64
 318         add     [DIGEST(5)], f_64
 319         add     [DIGEST(6)], g_64
 320         add     [DIGEST(7)], h_64
 321
 322         ; Advance to next message block
 323         add     msg, 16*8
 324         dec     msglen
 325         jnz     .updateblock
 326
 327         ; Restore GPRs
 328         mov     rbx, [rsp + frame.GPRSAVE + 8 * 0]
 329         mov     r12, [rsp + frame.GPRSAVE + 8 * 1]
 330         mov     r13, [rsp + frame.GPRSAVE + 8 * 2]
 331         mov     r14, [rsp + frame.GPRSAVE + 8 * 3]
 332         mov     r15, [rsp + frame.GPRSAVE + 8 * 4]
 333 %ifidn __OUTPUT_FORMAT__, win64
 334         mov     rsi, [rsp + frame.GPRSAVE + 8 * 5]
 335         mov     rdi, [rsp + frame.GPRSAVE + 8 * 6]
 336 %endif
 337         ; Restore Stack Pointer
 338         add     rsp, frame_size
 339
 340 .nowork:
 341         ret
 342
 343 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 344 ;;; Binary Data
     ; NOTE(review): both tables below are only ever read by the code in this
     ; file, yet they are placed in the writable .data section - confirm whether
     ; a read-only section is usable for all supported output formats.
 345
 346 section .data
 347
     ; 16-byte alignment is required: the code uses movdqa on XMM_QWORD_BSWAP
     ; and paddq with a memory operand on K512. XMM_QWORD_BSWAP is exactly 16
     ; bytes long, so K512 directly after it is 16-byte aligned as well.
 348 ALIGN 16
 349
 350 ; Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
     ; Stored little-endian, the index bytes are 7..0 then 15..8, i.e. the byte
     ; order is reversed inside each qword independently.
 351 XMM_QWORD_BSWAP:
 352         dq 0x0001020304050607, 0x08090a0b0c0d0e0f
 353
 354 ; K[t] used in SHA512 hashing
     ; 80 qwords (40 rows of 2), addressed through K_t(i) and always loaded in
     ; pairs {K[t] | K[t+1]} for even t.
 355 K512:
 356         dq 0x428a2f98d728ae22,0x7137449123ef65cd
 357         dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
 358         dq 0x3956c25bf348b538,0x59f111f1b605d019
 359         dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118
 360         dq 0xd807aa98a3030242,0x12835b0145706fbe
 361         dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
 362         dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1
 363         dq 0x9bdc06a725c71235,0xc19bf174cf692694
 364         dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3
 365         dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
 366         dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483
 367         dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
 368         dq 0x983e5152ee66dfab,0xa831c66d2db43210
 369         dq 0xb00327c898fb213f,0xbf597fc7beef0ee4
 370         dq 0xc6e00bf33da88fc2,0xd5a79147930aa725
 371         dq 0x06ca6351e003826f,0x142929670a0e6e70
 372         dq 0x27b70a8546d22ffc,0x2e1b21385c26c926
 373         dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
 374         dq 0x650a73548baf63de,0x766a0abb3c77b2a8
 375         dq 0x81c2c92e47edaee6,0x92722c851482353b
 376         dq 0xa2bfe8a14cf10364,0xa81a664bbc423001
 377         dq 0xc24b8b70d0f89791,0xc76c51a30654be30
 378         dq 0xd192e819d6ef5218,0xd69906245565a910
 379         dq 0xf40e35855771202a,0x106aa07032bbd1b8
 380         dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53
 381         dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
 382         dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
 383         dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
 384         dq 0x748f82ee5defb2fc,0x78a5636f43172f60
 385         dq 0x84c87814a1f0ab72,0x8cc702081a6439ec
 386         dq 0x90befffa23631e28,0xa4506cebde82bde9
 387         dq 0xbef9a3f7b2c67915,0xc67178f2e372532b
 388         dq 0xca273eceea26619c,0xd186b8c721c0c207
 389         dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
 390         dq 0x06f067aa72176fba,0x0a637dc5a2c898a6
 391         dq 0x113f9804bef90dae,0x1b710b35131c471b
 392         dq 0x28db77f523047d84,0x32caab7b40c72493
 393         dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
 394         dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
 395         dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817
 396