ceph/src/spdk/intel-ipsec-mb/sse/mb_mgr_hmac_sha_256_submit_ni_sse.asm

   1 ;;
   2 ;; Copyright (c) 2012-2018, Intel Corporation
   3 ;;
   4 ;; Redistribution and use in source and binary forms, with or without
   5 ;; modification, are permitted provided that the following conditions are met:
   6 ;;
   7 ;;     * Redistributions of source code must retain the above copyright notice,
   8 ;;       this list of conditions and the following disclaimer.
   9 ;;     * Redistributions in binary form must reproduce the above copyright
  10 ;;       notice, this list of conditions and the following disclaimer in the
  11 ;;       documentation and/or other materials provided with the distribution.
  12 ;;     * Neither the name of Intel Corporation nor the names of its contributors
  13 ;;       may be used to endorse or promote products derived from this software
  14 ;;       without specific prior written permission.
  15 ;;
  16 ;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17 ;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18 ;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  19 ;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
  20 ;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21 ;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22 ;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23 ;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24 ;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  25 ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26 ;;
  27
  28 ;; In System V AMD64 ABI
  29 ;;      calle saves: RBX, RBP, R12-R15
  30 ;; Windows x64 ABI
  31 ;;      calle saves: RBX, RBP, RDI, RSI, RSP, R12-R15
  32 ;;
  33 ;; Linux/Windows clobbers: xmm0 - xmm15
  34 ;;
  35
  36 %include "include/os.asm"
  37 %include "job_aes_hmac.asm"
  38 %include "mb_mgr_datastruct.asm"
  39 %include "include/reg_sizes.asm"
  40 %include "include/memcpy.asm"
  41
  42 ;%define DO_DBGPRINT
  43 %include "include/dbgprint.asm"
  44
  45 extern sha256_ni
  46
  47 %ifdef LINUX
  48 %define arg1    rdi
  49 %define arg2    rsi
  50 %define reg3    rcx
  51 %define reg4    rdx
  52 %else
  53 %define arg1    rcx
  54 %define arg2    rdx
  55 %define reg3    rdi
  56 %define reg4    rsi
  57 %endif
  58
  59 %define state   arg1
  60 %define job     arg2
  61 %define len2    arg2
  62
  63
  64 ; idx needs to be in rbx, rbp, r13-r15
  65 %define last_len        rbp
  66 %define idx             rbp
  67
  68 %define p               r11
  69 %define start_offset    r11
  70
  71 %define unused_lanes    rbx
  72 %define tmp4            rbx
  73
  74 %define job_rax         rax
  75 %define len             rax
  76
  77 %define size_offset     reg3
  78 %define tmp2            reg3
  79
  80 %define lane            reg4
  81
  82 %define extra_blocks    r8
  83
  84 %define tmp             r9
  85 %define p2              r9
  86
  87 %define lane_data       r10
  88
  89 %define bswap_xmm4      xmm4
  90
  91 struc STACK
  92 _gpr_save:      resq    4       ; rbx, rbp, rsi (win), rdi (win)
  93 _rsp_save:      resq    1
  94 endstruc
  95
  96 section .data
  97 default rel
  98
  99 align 16
 100 byteswap:
 101         dq 0x0405060700010203
 102         dq 0x0c0d0e0f08090a0b
 103
 104 section .text
 105
 106 %ifdef SHA224
 107 ; JOB* submit_job_hmac_sha_224_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job)
 108 ; arg 1 : state
 109 ; arg 2 : job
 110 MKGLOBAL(submit_job_hmac_sha_224_ni_sse,function,internal)
 111 submit_job_hmac_sha_224_ni_sse:
 112
 113 %else
 114
 115 ; JOB* submit_job_hmac_sha_256_ni_sse(MB_MGR_HMAC_SHA_256_OOO *state, JOB_AES_HMAC *job)
 116 ; arg 1 : state
 117 ; arg 2 : job
 118 MKGLOBAL(submit_job_hmac_sha_256_ni_sse,function,internal)
 119 submit_job_hmac_sha_256_ni_sse:
 120 %endif
 121
 122         mov     rax, rsp
 123         sub     rsp, STACK_size
 124         and     rsp, -16
 125
 126         mov     [rsp + _gpr_save + 8*0], rbx
 127         mov     [rsp + _gpr_save + 8*1], rbp
 128 %ifndef LINUX
 129         mov     [rsp + _gpr_save + 8*2], rsi
 130         mov     [rsp + _gpr_save + 8*3], rdi
 131 %endif
 132         mov     [rsp + _rsp_save], rax  ; original SP
 133
 134         DBGPRINTL "enter sha256-ni-sse submit"
 135
 136         mov     unused_lanes, [state + _unused_lanes_sha256]
 137         movzx   lane, BYTE(unused_lanes)
 138         DBGPRINTL64 "lane: ", lane
 139         shr     unused_lanes, 8
 140         imul    lane_data, lane, _HMAC_SHA1_LANE_DATA_size ; SHA1 & SHA256 lane data is the same
 141         lea     lane_data, [state + _ldata_sha256 + lane_data]
 142         mov     [state + _unused_lanes_sha256], unused_lanes
 143         mov     len, [job + _msg_len_to_hash_in_bytes]
 144         DBGPRINTL64 "length: ", len
 145         mov     tmp, len
 146         shr     tmp, 6  ; divide by 64, len in terms of blocks
 147
 148         mov     [lane_data + _job_in_lane], job
 149         mov     dword [lane_data + _outer_done], 0
 150         mov     [state + _lens_sha256 + 2*lane], WORD(tmp)
 151
 152         mov     last_len, len
 153         and     last_len, 63
 154         lea     extra_blocks, [last_len + 9 + 63]
 155         shr     extra_blocks, 6
 156         mov     [lane_data + _extra_blocks], DWORD(extra_blocks)
 157
 158         mov     p, [job + _src]
 159         add     p, [job + _hash_start_src_offset_in_bytes]
 160         mov     [state + _args_data_ptr_sha256 + 8*lane], p
 161
 162         cmp     len, 64
 163         jb      copy_lt64
 164
 165 fast_copy:
 166         add     p, len
 167         movdqu  xmm0, [p - 64 + 0*16]
 168         movdqu  xmm1, [p - 64 + 1*16]
 169         movdqu  xmm2, [p - 64 + 2*16]
 170         movdqu  xmm3, [p - 64 + 3*16]
 171         movdqa  [lane_data + _extra_block + 0*16], xmm0
 172         movdqa  [lane_data + _extra_block + 1*16], xmm1
 173         movdqa  [lane_data + _extra_block + 2*16], xmm2
 174         movdqa  [lane_data + _extra_block + 3*16], xmm3
 175 end_fast_copy:
 176
 177         mov     size_offset, extra_blocks
 178         shl     size_offset, 6
 179         sub     size_offset, last_len
 180         add     size_offset, 64-8
 181         mov     [lane_data + _size_offset], DWORD(size_offset)
 182         mov     start_offset, 64
 183         sub     start_offset, last_len
 184         mov     [lane_data + _start_offset], DWORD(start_offset)
 185
 186         lea     tmp, [8*64 + 8*len]
 187         bswap   tmp
 188         mov     [lane_data + _extra_block + size_offset], tmp
 189
 190         mov     tmp, [job + _auth_key_xor_ipad]
 191         movdqu  xmm0, [tmp]
 192         movdqu  xmm1, [tmp + 4*4]
 193 %if SHA256NI_DIGEST_ROW_SIZE != 32
 194 %error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!"
 195 %endif
 196         lea     tmp, [lane*8]   ; x8 here plus x4 scale factor give x32
 197         movdqu  [state + _args_digest_sha256 + tmp*4], xmm0
 198         movdqu  [state + _args_digest_sha256 + tmp*4 + 4*4], xmm1
 199         DBGPRINTL       "args digest:"
 200         DBGPRINT_XMM    xmm0
 201         DBGPRINT_XMM    xmm1
 202         test    len, ~63
 203         jnz     ge64_bytes
 204
 205 lt64_bytes:
 206         mov     [state + _lens_sha256 + 2*lane], WORD(extra_blocks)
 207         lea     tmp, [lane_data + _extra_block + start_offset]
 208         mov     [state + _args_data_ptr_sha256 + 8*lane], tmp
 209         mov     dword [lane_data + _extra_blocks], 0
 210
 211 ge64_bytes:
 212         cmp     unused_lanes, 0xff
 213         jne     return_null
 214         jmp     start_loop
 215
 216         align   16
 217 start_loop:
 218         ; Find min length - only two lanes available
 219         xor     len2, len2
 220         mov     tmp, 0x10000
 221         mov     WORD(len2), word [state + _lens_sha256 + 0*2]   ; [0:15] - lane 0 length, [16:31] - lane index (0)
 222         mov     WORD(tmp), word [state + _lens_sha256 + 1*2]    ; [0:15] - lane 1 length, [16:31] - lane index (1)
 223         cmp     WORD(len2), WORD(tmp)
 224         cmovg   DWORD(len2), DWORD(tmp) ; move if lane 0 length is greater than lane 1 length
 225
 226         mov     idx, len2               ; retrieve index & length from [16:31] and [0:15] bit fields
 227         shr     DWORD(idx), 16
 228         and     DWORD(len2), 0xffff
 229         je      len_is_0
 230
 231         sub     word [state + _lens_sha256 + 0*2], WORD(len2)
 232         sub     word [state + _lens_sha256 + 1*2], WORD(len2)
 233
 234         ; "state" and "args" are the same address, arg1
 235         ; len is arg2
 236         call    sha256_ni
 237         ; state is intact
 238 len_is_0:
 239         ; process completed job "idx"
 240         imul    lane_data, idx, _HMAC_SHA1_LANE_DATA_size
 241         lea     lane_data, [state + _ldata_sha256 + lane_data]
 242         mov     DWORD(extra_blocks), [lane_data + _extra_blocks]
 243         cmp     extra_blocks, 0
 244         jne     proc_extra_blocks
 245         movdqa  bswap_xmm4, [rel byteswap]
 246         cmp     dword [lane_data + _outer_done], 0
 247         jne     end_loop
 248
 249 proc_outer:
 250         mov     dword [lane_data + _outer_done], 1
 251         mov     DWORD(size_offset), [lane_data + _size_offset]
 252         mov     qword [lane_data + _extra_block + size_offset], 0
 253         mov     word [state + _lens_sha256 + 2*idx], 1
 254         lea     tmp, [lane_data + _outer_block]
 255         mov     job, [lane_data + _job_in_lane]
 256         mov     [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp
 257
 258 %if SHA256NI_DIGEST_ROW_SIZE != 32
 259 %error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!"
 260 %endif
 261         lea     tmp4, [idx*8]   ; x8 here + scale factor x4 below give x32
 262         movdqu  xmm0, [state + _args_digest_sha256 + tmp4*4]
 263         movdqu  xmm1, [state + _args_digest_sha256 + tmp4*4 + 4*4]
 264         pshufb  xmm0, bswap_xmm4
 265         pshufb  xmm1, bswap_xmm4
 266         movdqa  [lane_data + _outer_block], xmm0
 267         movdqa  [lane_data + _outer_block + 4*4], xmm1
 268 %ifdef SHA224
 269         ;; overwrite top 4 bytes with 0x80
 270         mov     dword [lane_data + _outer_block + 7*4], 0x80
 271 %endif
 272
 273         mov     tmp, [job + _auth_key_xor_opad]
 274         movdqu  xmm0, [tmp]
 275         movdqu  xmm1,  [tmp + 4*4]
 276         movdqu  [state + _args_digest_sha256 + tmp4*4], xmm0
 277         movdqu  [state + _args_digest_sha256 + tmp4*4 + 4*4], xmm1
 278         jmp     start_loop
 279
 280         align   16
 281 proc_extra_blocks:
 282         mov     DWORD(start_offset), [lane_data + _start_offset]
 283         mov     [state + _lens_sha256 + 2*idx], WORD(extra_blocks)
 284         lea     tmp, [lane_data + _extra_block + start_offset]
 285         mov     [state + _args_data_ptr_sha256 + PTR_SZ*idx], tmp
 286         mov     dword [lane_data + _extra_blocks], 0
 287         jmp     start_loop
 288
 289         align   16
 290
 291 copy_lt64:
 292         ;; less than one message block of data
 293         ;; beginning of source block
 294         ;; destination extrablock but backwards by len from where 0x80 pre-populated
 295         ;; p2 clobbers unused_lanes, undo before exit
 296         lea     p2, [lane_data + _extra_block  + 64]
 297         sub     p2, len
 298         memcpy_sse_64_1 p2, p, len, tmp4, tmp2, xmm0, xmm1, xmm2, xmm3
 299         mov     unused_lanes, [state + _unused_lanes_sha256]
 300         jmp     end_fast_copy
 301
 302 return_null:
 303         xor     job_rax, job_rax
 304         jmp     return
 305
 306         align   16
 307 end_loop:
 308         mov     job_rax, [lane_data + _job_in_lane]
 309         mov     unused_lanes, [state + _unused_lanes_sha256]
 310         mov     qword [lane_data + _job_in_lane], 0
 311         or      dword [job_rax + _status], STS_COMPLETED_HMAC
 312         shl     unused_lanes, 8
 313         or      unused_lanes, idx
 314         mov     [state + _unused_lanes_sha256], unused_lanes
 315
 316         mov     p, [job_rax + _auth_tag_output]
 317
 318         ; copy 16 bytes for SHA256, 14 for SHA224
 319 %if SHA256NI_DIGEST_ROW_SIZE != 32
 320 %error "Below code has been optimized for SHA256NI_DIGEST_ROW_SIZE = 32!"
 321 %endif
 322         shl     idx, 5
 323
 324 %ifdef SHA224
 325         cmp     qword [job_rax + _auth_tag_output_len_in_bytes], 14
 326         jne     copy_full_digest
 327 %else
 328         cmp     qword [job_rax + _auth_tag_output_len_in_bytes], 16
 329         jne     copy_full_digest
 330 %endif
 331
 332         movdqu  xmm0,  [state + _args_digest_sha256 + idx]
 333         pshufb  xmm0, bswap_xmm4
 334 %ifdef SHA224
 335         ;; SHA224
 336         movq    [p + 0*4], xmm0
 337         pextrd  [p + 2*4], xmm0, 2
 338         pextrw  [p + 3*4], xmm0, 6
 339 %else
 340         ;; SHA256
 341         movdqu  [p], xmm0
 342 %endif
 343         jmp     clear_ret
 344
 345 copy_full_digest:
 346         movdqu  xmm0,  [state + _args_digest_sha256 + idx]
 347         movdqu  xmm1,  [state + _args_digest_sha256 + idx + 16]
 348         pshufb  xmm0, bswap_xmm4
 349         pshufb  xmm1, bswap_xmm4
 350 %ifdef SHA224
 351         ;; SHA224
 352         movdqu  [p], xmm0
 353         movq    [p + 16], xmm1
 354         pextrd  [p + 16 + 8], xmm1, 2
 355 %else
 356         ;; SHA256
 357         movdqu  [p], xmm0
 358         movdqu  [p + 16], xmm1
 359 %endif
 360
 361 clear_ret:
 362
 363 %ifdef SAFE_DATA
 364         pxor    xmm0, xmm0
 365         ;; Clear digest, outer_block (28B/32B) and extra_block (64B) of returned job
 366         movdqa  [state + _args_digest_sha256 + idx], xmm0
 367         movdqa  [state + _args_digest_sha256 + idx + 16], xmm0
 368
 369         shr     idx, 5 ;; Restore lane idx to 0 or 1
 370         imul    lane_data, idx, _HMAC_SHA1_LANE_DATA_size
 371         lea     lane_data, [state + _ldata_sha256 + lane_data]
 372         ;; Clear first 64 bytes of extra_block
 373 %assign offset 0
 374 %rep 4
 375         movdqa  [lane_data + _extra_block + offset], xmm0
 376 %assign offset (offset + 16)
 377 %endrep
 378
 379         ;; Clear first 28 bytes (SHA-224) or 32 bytes (SHA-256) of outer_block
 380         movdqa  [lane_data + _outer_block], xmm0
 381 %ifdef SHA224
 382         mov     qword [lane_data + _outer_block + 16], 0
 383         mov     dword [lane_data + _outer_block + 24], 0
 384 %else
 385         movdqa  [lane_data + _outer_block + 16], xmm0
 386 %endif
 387 %endif ;; SAFE_DATA
 388
 389 return:
 390         mov     rbx, [rsp + _gpr_save + 8*0]
 391         mov     rbp, [rsp + _gpr_save + 8*1]
 392 %ifndef LINUX
 393         mov     rsi, [rsp + _gpr_save + 8*2]
 394         mov     rdi, [rsp + _gpr_save + 8*3]
 395 %endif
 396         mov     rsp, [rsp + _rsp_save]  ; original SP
 397         ret
 398
 399 %ifdef LINUX
 400 section .note.GNU-stack noalloc noexec nowrite progbits
 401 %endif