;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2019 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;       Function API:
;       uint64_t crc64_iso_refl_by16_10(
;               uint64_t init_crc,          //initial CRC value, 64 bits
;               const unsigned char *buf,   //buffer pointer to calculate CRC on
;               uint64_t len                //buffer length in bytes (64-bit data)
;       );
;
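;       The "refl" (reflected) variants operate on bit-reflected data; note the
;       `not arg1` on entry and `not rax` before returning, which complement the
;       seed and the result as the usual CRC-64 (ISO 3309) convention expects.
;
;       Minimal usage sketch (assumed typical isa-l usage -- see crc64.h for the
;       authoritative prototype; seed with 0 and chain a multi-buffer CRC by
;       passing the previous return value back in):
;
;           uint64_t crc = crc64_iso_refl_by16_10(0, buf, len);
;           crc = crc64_iso_refl_by16_10(crc, buf2, len2);
;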
%include "reg_sizes.asm"

%ifndef FUNCTION_NAME
%define FUNCTION_NAME crc64_iso_refl_by16_10
%endif

%if (AS_FEATURE_LEVEL) >= 10
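; AS_FEATURE_LEVEL >= 10 appears to be isa-l's marker for an assembler that can
; encode the EVEX (AVX-512) forms of vpclmulqdq used below; with an older
; assembler only the no_ stub at the end of this file is emitted (win64 only).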

%define fetch_dist 1024

[bits 64]
default rel

section .text


%ifidn __OUTPUT_FORMAT__, win64
        %xdefine arg1 rcx
        %xdefine arg2 rdx
        %xdefine arg3 r8
%else
        %xdefine arg1 rdi
        %xdefine arg2 rsi
        %xdefine arg3 rdx
%endif

%define TMP 16*0
%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_SAVE 16*2
        %define VARIABLE_OFFSET 16*12+8
%else
        %define VARIABLE_OFFSET 16*2+8
%endif

align 16
global FUNCTION_NAME:ISAL_SYM_TYPE_FUNCTION
FUNCTION_NAME:
        not     arg1
        sub     rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
        ; push the xmm registers onto the stack to preserve them
        ; (xmm6-xmm15 are callee-saved under the win64 calling convention)
        vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
        vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
        vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
        vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
        vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
        vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
        vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
        vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
        vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
        vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
%endif

        cmp     arg3, 256
        jl      _less_than_256

        ; load the initial crc value
        vmovq   xmm10, arg1             ; initial crc

        ; load the initial 128 bytes of data and xor in the initial crc value
        vmovdqu8 zmm0, [arg2+16*0]
        vmovdqu8 zmm4, [arg2+16*4]
        vpxorq  zmm0, zmm10
        vbroadcasti32x4 zmm10, [rk3]    ; zmm10 has rk3 and rk4
        ; the imm value of the pclmulqdq instruction determines which constant is used

        sub     arg3, 256
        cmp     arg3, 256
        jl      _fold_128_B_loop

        vmovdqu8 zmm7, [arg2+16*8]
        vmovdqu8 zmm8, [arg2+16*12]
        vbroadcasti32x4 zmm16, [rk_1]   ; zmm16 has rk_1 and rk_2
        sub     arg3, 256

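        ; Main 256-byte folding loop, using the carry-less-multiply folding scheme
        ; described in Intel's "Fast CRC Computation Using PCLMULQDQ Instruction"
        ; white paper: each 64-byte accumulator (zmm0, zmm4, zmm7, zmm8) is
        ; multiplied by the rk_1/rk_2 constant pair (the modular shift of the
        ; running remainder by 256 bytes) and xored with the next 256 bytes of input.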
_fold_256_B_loop:
        add     arg2, 256
        vmovdqu8 zmm3, [arg2+16*0]
        vpclmulqdq zmm1, zmm0, zmm16, 0x10
        vpclmulqdq zmm2, zmm0, zmm16, 0x01
        vpxorq  zmm0, zmm1, zmm2
        vpxorq  zmm0, zmm0, zmm3

        vmovdqu8 zmm9, [arg2+16*4]
        vpclmulqdq zmm5, zmm4, zmm16, 0x10
        vpclmulqdq zmm6, zmm4, zmm16, 0x01
        vpxorq  zmm4, zmm5, zmm6
        vpxorq  zmm4, zmm4, zmm9

        vmovdqu8 zmm11, [arg2+16*8]
        vpclmulqdq zmm12, zmm7, zmm16, 0x10
        vpclmulqdq zmm13, zmm7, zmm16, 0x01
        vpxorq  zmm7, zmm12, zmm13
        vpxorq  zmm7, zmm7, zmm11

        vmovdqu8 zmm17, [arg2+16*12]
        vpclmulqdq zmm14, zmm8, zmm16, 0x10
        vpclmulqdq zmm15, zmm8, zmm16, 0x01
        vpxorq  zmm8, zmm14, zmm15
        vpxorq  zmm8, zmm8, zmm17

        sub     arg3, 256
        jge     _fold_256_B_loop

        ;; Fold 256 into 128
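        ;; vpternlogq with imm8 0x96 is a three-way xor (dst = dst ^ src1 ^ src2),
        ;; so each pair of accumulators collapses into one in a single instruction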
        add     arg2, 256
        vpclmulqdq zmm1, zmm0, zmm10, 0x01
        vpclmulqdq zmm2, zmm0, zmm10, 0x10
        vpternlogq zmm7, zmm1, zmm2, 0x96       ; xor ABC

        vpclmulqdq zmm5, zmm4, zmm10, 0x01
        vpclmulqdq zmm6, zmm4, zmm10, 0x10
        vpternlogq zmm8, zmm5, zmm6, 0x96       ; xor ABC

        vmovdqa32 zmm0, zmm7
        vmovdqa32 zmm4, zmm8

        add     arg3, 128
        jmp     _fold_128_B_register

; fold 128B at a time. This section of the code folds 2 zmm registers in parallel
_fold_128_B_loop:
        add     arg2, 128               ; update the buffer pointer
        vmovdqu8 zmm8, [arg2+16*0]
        vpclmulqdq zmm1, zmm0, zmm10, 0x10
        vpclmulqdq zmm2, zmm0, zmm10, 0x01
        vpxorq  zmm0, zmm1, zmm2
        vpxorq  zmm0, zmm0, zmm8

        vmovdqu8 zmm9, [arg2+16*4]
        vpclmulqdq zmm5, zmm4, zmm10, 0x10
        vpclmulqdq zmm6, zmm4, zmm10, 0x01
        vpxorq  zmm4, zmm5, zmm6
        vpxorq  zmm4, zmm4, zmm9

        sub     arg3, 128
        jge     _fold_128_B_loop
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        add     arg2, 128
        ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
        ; the 128B of folded data is in 2 zmm registers: zmm0, zmm4

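; Reduce the two zmm accumulators (8 x 128-bit lanes) down to a single xmm:
; each lane is folded with its own constant from rk9..rk20, the highest lane
; needs no multiplicand, and the vshufi64x2/vextracti64x2 shuffles below xor
; the four 128-bit lanes of the result together into xmm7.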
_fold_128_B_register:
        ; fold the 8 128-bit parts into 1 xmm register with different constants
        vmovdqu8 zmm16, [rk9]           ; multiply by rk9-rk16
        vmovdqu8 zmm11, [rk17]          ; multiply by rk17-rk20, rk1,rk2, 0,0
        vpclmulqdq zmm1, zmm0, zmm16, 0x01
        vpclmulqdq zmm2, zmm0, zmm16, 0x10
        vextracti64x2 xmm7, zmm4, 3     ; save the highest 128-bit lane, which has no multiplicand

        vpclmulqdq zmm5, zmm4, zmm11, 0x01
        vpclmulqdq zmm6, zmm4, zmm11, 0x10
        vmovdqa xmm10, [rk1]            ; needed later in the reduction loop
        vpternlogq zmm1, zmm2, zmm5, 0x96       ; xor ABC
        vpternlogq zmm1, zmm6, zmm7, 0x96       ; xor ABC

        vshufi64x2 zmm8, zmm1, zmm1, 0x4e       ; swap 1,0,3,2 - 01 00 11 10
        vpxorq  ymm8, ymm8, ymm1
        vextracti64x2 xmm5, ymm8, 1
        vpxorq  xmm7, xmm5, xmm8

        ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
        ; instead of a cmp instruction, we use the sign flag set by the add with the jl instruction
        add     arg3, 128-16
        jl      _final_reduction_for_128

        ; now we have 16+y bytes left to reduce. 16 bytes are in register xmm7 and the rest is in memory
        ; we can fold 16 bytes at a time if y >= 16
        ; continue folding 16B at a time

_16B_reduction_loop:
        vmovdqa xmm8, xmm7
        vpclmulqdq xmm7, xmm10, 0x1
        vpclmulqdq xmm8, xmm10, 0x10
        vpxor   xmm7, xmm8
        vmovdqu xmm0, [arg2]
        vpxor   xmm7, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the jge instruction
        ; equivalent of: cmp arg3, 16-16
        ; check if there are any more full 16-byte blocks in the buffer to fold
        jge     _16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm7 register


_final_reduction_for_128:
        add     arg3, 16
        je      _128_done
        ; the remaining data is less than 16 bytes.
        ; since we know there was data before the pointer, we can offset
        ; the input pointer back so that exactly 16 bytes are loaded;
        ; the registers are then adjusted to discard the bytes already processed.
_get_last_two_xmms:

        vmovdqa xmm2, xmm7
        vmovdqu xmm1, [arg2 - 16 + arg3]

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table]
        add     rax, arg3
        vmovdqu xmm0, [rax]

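        ; xmm0 now holds a byte-shuffle mask from pshufb_shf_table indexed by the
        ; tail length: vpshufb zeroes any byte whose mask byte has its top bit set,
        ; and xoring the mask with mask3 (0x80 in every byte) flips that selection.
        ; The two complementary shuffles plus the vpblendvb below merge the shifted
        ; remainder (xmm7/xmm2) with the freshly loaded tail bytes (xmm1), leaving
        ; exactly 16 bytes to fold one last time.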
        vpshufb xmm7, xmm0
        vpxor   xmm0, [mask3]
        vpshufb xmm2, xmm0

        vpblendvb xmm2, xmm2, xmm1, xmm0
        ;;;;;;;;;;
        vmovdqa xmm8, xmm7
        vpclmulqdq xmm7, xmm10, 0x1

        vpclmulqdq xmm8, xmm10, 0x10
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm2

_128_done:
        ; compute crc of a 128-bit value
        vmovdqa xmm10, [rk5]
        vmovdqa xmm0, xmm7

        ; 64b fold
        vpclmulqdq xmm7, xmm10, 0
        vpsrldq xmm0, 8
        vpxor   xmm7, xmm0

        ; barrett reduction
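        ; rk7 is (presumably) the Barrett constant floor(x^128 / P) and rk8 the
        ; polynomial P itself, both bit-reflected: the first multiply estimates the
        ; quotient, the second multiplies it back by P, and the xors leave the
        ; 64-bit remainder in the upper qword of xmm7, extracted below into rax.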
_barrett:
        vmovdqa xmm1, xmm7
        vmovdqa xmm10, [rk7]

        vpclmulqdq xmm7, xmm10, 0
        vmovdqa xmm2, xmm7
        vpclmulqdq xmm7, xmm10, 0x10
        vpslldq xmm2, 8
        vpxor   xmm7, xmm2
        vpxor   xmm7, xmm1
        vpextrq rax, xmm7, 1

_cleanup:
        not     rax


%ifidn __OUTPUT_FORMAT__, win64
        vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
        vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
        vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
        vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
        vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
        vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
        vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
        vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
        vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
        vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
%endif
        add     rsp, VARIABLE_OFFSET
        ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_256:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      _less_than_32

        ; if there is, load the constants
        vmovdqa xmm10, [rk1]            ; rk1 and rk2 in xmm10

        vmovq   xmm0, arg1              ; get the initial crc value
        vmovdqu xmm7, [arg2]            ; load the data
        vpxor   xmm7, xmm0

        ; update the buffer pointer
        add     arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        sub     arg3, 32

        jmp     _16B_reduction_loop

align 16
_less_than_32:
        ; mov the initial crc to the return value. this is necessary for zero-length buffers.
        mov     rax, arg1
        test    arg3, arg3
        je      _cleanup

        vmovq   xmm0, arg1              ; get the initial crc value

        cmp     arg3, 16
        je      _exact_16_left
        jl      _less_than_16_left

        vmovdqu xmm7, [arg2]            ; load the data
        vpxor   xmm7, xmm0              ; xor the initial crc value
        add     arg2, 16
        sub     arg3, 16
        vmovdqa xmm10, [rk1]            ; rk1 and rk2 in xmm10
        jmp     _get_last_two_xmms


align 16
_less_than_16_left:
        ; use the stack to assemble data shorter than 16 bytes; zero out the 16B slot first.

        vpxor   xmm1, xmm1
        mov     r11, rsp
        vmovdqa [r11], xmm1

        ; backup the counter value
        mov     r9, arg3
        cmp     arg3, 8
        jl      _less_than_8_left

        ; load 8 Bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
_less_than_8_left:

        cmp     arg3, 4
        jl      _less_than_4_left

        ; load 4 Bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
_less_than_4_left:

        cmp     arg3, 2
        jl      _less_than_2_left

        ; load 2 Bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
_less_than_2_left:
        cmp     arg3, 1
        jl      _zero_left

        ; load 1 Byte
        mov     al, [arg2]
        mov     [r11], al

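        ; at this point the partial block has been assembled (zero-padded) at [rsp];
        ; below it is xored with the initial crc and shifted into position with a
        ; pshufb_shf_table mask selected by the original byte count saved in r9:
        ; 8-15 byte tails go through the full 128-bit reduction (_128_done), while
        ; 1-7 byte tails jump straight to the barrett step.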
_zero_left:
        vmovdqa xmm7, [rsp]
        vpxor   xmm7, xmm0              ; xor the initial crc value

        lea     rax, [pshufb_shf_table]

        cmp     r9, 8
        jl      _end_1to7

_end_8to15:
        vmovdqu xmm0, [rax + r9]
        vpshufb xmm7, xmm0
        jmp     _128_done

_end_1to7:
        ; Left shift (8-length) bytes in XMM
        vmovdqu xmm0, [rax + r9 + 8]
        vpshufb xmm7, xmm0

        jmp     _barrett

align 16
_exact_16_left:
        vmovdqu xmm7, [arg2]
        vpxor   xmm7, xmm0              ; xor the initial crc value

        jmp     _128_done

section .data
align 32

%ifndef USE_CONSTS
; precomputed constants
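; (these are the fold multipliers -- bit-reflected powers of x modulo the ISO
;  polynomial for the various fold distances used above -- plus the final
;  64-bit-fold and Barrett constants in rk5..rk8; the derivation follows Intel's
;  "Fast CRC Computation Using PCLMULQDQ Instruction" white paper)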
rk_1: dq 0x45000000b0000000
rk_2: dq 0x6b700000f5000000
rk1: dq 0xf500000000000001
rk2: dq 0x6b70000000000001
rk3: dq 0xb001000000010000
rk4: dq 0xf501b0000001b000
rk5: dq 0xf500000000000001
rk6: dq 0x0000000000000000
rk7: dq 0xb000000000000001
rk8: dq 0xb000000000000000
rk9: dq 0xe014514514501501
rk10: dq 0x771db6db6db71c71
rk11: dq 0xa101101101110001
rk12: dq 0x1ab1ab1ab1aab001
rk13: dq 0xf445014445000001
rk14: dq 0x6aab71daab700001
rk15: dq 0xb100010100000001
rk16: dq 0x01b001b1b0000001
rk17: dq 0xe145150000000001
rk18: dq 0x76db6c7000000001
rk19: dq 0xa011000000000001
rk20: dq 0x1b1ab00000000001

rk_1b: dq 0xf500000000000001
rk_2b: dq 0x6b70000000000001
        dq 0x0000000000000000
        dq 0x0000000000000000
%else
INCLUDE_CONSTS
%endif

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908

mask: dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
mask2: dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
mask3: dq 0x8080808080808080, 0x8080808080808080

%else  ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
%ifidn __OUTPUT_FORMAT__, win64
global no_ %+ FUNCTION_NAME
no_ %+ FUNCTION_NAME %+ :
%endif
%endif ; (AS_FEATURE_LEVEL) >= 10