ceph/src/isa-l/crc/crc64_iso_refl_by16_10.asm

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2019 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29
  30 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  31 ;       Function API:
  32 ;       uint64_t crc64_iso_refl_by16_10(
  33 ;               uint64_t init_crc, //initial CRC value, 64 bits
  34 ;               const unsigned char *buf, //buffer pointer to calculate CRC on
  35 ;               uint64_t len //buffer length in bytes (64-bit data)
  36 ;       );
  37 ;
  38 %include "reg_sizes.asm"
  39
  40 %ifndef FUNCTION_NAME
  41 %define FUNCTION_NAME crc64_iso_refl_by16_10
  42 %endif  ; exported symbol defaults to crc64_iso_refl_by16_10 unless FUNCTION_NAME was defined earlier
  43         ; the implementation below (up to the matching %else) is assembled only when AS_FEATURE_LEVEL >= 10
  44 %if (AS_FEATURE_LEVEL) >= 10
  45         ; NOTE(review): fetch_dist is not referenced in the visible part of this file - confirm it is still needed
  46 %define fetch_dist      1024
  47
  48 [bits 64]
  49 default rel
  50         ; "default rel": memory operands that name only a symbol (e.g. [rk3]) are encoded RIP-relative
  51 section .text
  52
  53
  54 %ifidn __OUTPUT_FORMAT__, win64
  55         %xdefine        arg1 rcx        ; 1st argument: init_crc
  56         %xdefine        arg2 rdx        ; 2nd argument: buf
  57         %xdefine        arg3 r8         ; 3rd argument: len
  58 %else
  59         %xdefine        arg1 rdi        ; 1st argument: init_crc
  60         %xdefine        arg2 rsi        ; 2nd argument: buf
  61         %xdefine        arg3 rdx        ; 3rd argument: len
  62 %endif
  63         ; stack frame layout, as offsets from rsp after "sub rsp, VARIABLE_OFFSET"
  64 %define TMP 16*0        ; scratch area at offset 0; the name TMP is not referenced in the visible code (the <16-byte path addresses [rsp] directly)
  65 %ifidn __OUTPUT_FORMAT__, win64
  66         %define XMM_SAVE 16*2           ; start of the save area for xmm6-xmm15
  67         %define VARIABLE_OFFSET 16*12+8 ; 32 bytes below XMM_SAVE + 10*16 bytes of saved registers + 8
  68 %else
  69         %define VARIABLE_OFFSET 16*2+8  ; 32 bytes + 8; no XMM save area on this path
  70 %endif  ; the "+8" assumes rsp % 16 == 8 at entry (return address pushed), so rsp is 16-byte aligned after the sub, as the vmovdqa stack accesses require
  71
  72 align 16
  73 mk_global FUNCTION_NAME, function
  74 FUNCTION_NAME:          ; entry: arg1 = init_crc, arg2 = buf, arg3 = len; the 64-bit CRC is returned in rax
  75         endbranch       ; macro defined outside this file (presumably in reg_sizes.asm)
  76         not             arg1            ; work with the bitwise complement of init_crc; the result is complemented again at _cleanup
  77         sub             rsp, VARIABLE_OFFSET    ; reserve the stack frame (scratch area, plus the XMM save area on win64)
  78
  79 %ifidn __OUTPUT_FORMAT__, win64
  80         ; save xmm6-xmm15 on the stack so they can be restored at _cleanup before returning
  81         vmovdqa         [rsp + XMM_SAVE + 16*0], xmm6
  82         vmovdqa         [rsp + XMM_SAVE + 16*1], xmm7
  83         vmovdqa         [rsp + XMM_SAVE + 16*2], xmm8
  84         vmovdqa         [rsp + XMM_SAVE + 16*3], xmm9
  85         vmovdqa         [rsp + XMM_SAVE + 16*4], xmm10
  86         vmovdqa         [rsp + XMM_SAVE + 16*5], xmm11
  87         vmovdqa         [rsp + XMM_SAVE + 16*6], xmm12
  88         vmovdqa         [rsp + XMM_SAVE + 16*7], xmm13
  89         vmovdqa         [rsp + XMM_SAVE + 16*8], xmm14
  90         vmovdqa         [rsp + XMM_SAVE + 16*9], xmm15
  91 %endif
  92         ; buffers shorter than 256 bytes take the xmm-only paths. NOTE(review): jl is a signed compare, so a len >= 2^63 would also branch here
  93         cmp             arg3, 256
  94         jl              _less_than_256
  95
  96         ; load the initial crc value
  97         vmovq           xmm10, arg1      ; initial crc in the low 64 bits; the rest of zmm10 is zeroed
  98
  99         ; receive the initial 128B data, xor the initial crc value
 100         vmovdqu8        zmm0, [arg2+16*0]       ; bytes 0..63
 101         vmovdqu8        zmm4, [arg2+16*4]       ; bytes 64..127
 102         vpxorq          zmm0, zmm10             ; only the first 8 data bytes change (upper part of zmm10 is zero)
 103         vbroadcasti32x4 zmm10, [rk3]    ;zmm10 has rk3 and rk4 in every 128-bit lane
 104                                         ;imm value of pclmulqdq instruction will determine which constant to use
 105
 106         sub             arg3, 256               ; arg3 = len - 256
 107         cmp             arg3, 256
 108         jl              _fold_128_B_loop        ; fewer than 512 bytes in total: use the 128-byte loop
 109
 110         vmovdqu8        zmm7, [arg2+16*8]       ; bytes 128..191
 111         vmovdqu8        zmm8, [arg2+16*12]      ; bytes 192..255
 112         vbroadcasti32x4 zmm16, [rk_1]   ;zmm16 has rk-1 and rk-2 in every 128-bit lane
 113         sub             arg3, 256               ; arg3 = len - 512; falls through into _fold_256_B_loop
 114
 115 _fold_256_B_loop:       ; each iteration folds the four accumulators zmm0, zmm4, zmm7, zmm8 with the next 256 input bytes
 116         add             arg2, 256               ; advance to the next unread 256-byte block
 117         vmovdqu8        zmm3, [arg2+16*0]
 118         vpclmulqdq      zmm1, zmm0, zmm16, 0x10 ; per lane: low qword of zmm0 x high qword of zmm16 (rk_2)
 119         vpclmulqdq      zmm2, zmm0, zmm16, 0x01 ; per lane: high qword of zmm0 x low qword of zmm16 (rk_1)
 120         vpxorq          zmm0, zmm1, zmm2
 121         vpxorq          zmm0, zmm0, zmm3        ; zmm0 = both products xor new data
 122         ; the next three groups repeat the same pattern for zmm4, zmm7 and zmm8
 123         vmovdqu8        zmm9, [arg2+16*4]
 124         vpclmulqdq      zmm5, zmm4, zmm16, 0x10
 125         vpclmulqdq      zmm6, zmm4, zmm16, 0x01
 126         vpxorq          zmm4, zmm5, zmm6
 127         vpxorq          zmm4, zmm4, zmm9
 128
 129         vmovdqu8        zmm11, [arg2+16*8]
 130         vpclmulqdq      zmm12, zmm7, zmm16, 0x10
 131         vpclmulqdq      zmm13, zmm7, zmm16, 0x01
 132         vpxorq          zmm7, zmm12, zmm13
 133         vpxorq          zmm7, zmm7, zmm11
 134
 135         vmovdqu8        zmm17, [arg2+16*12]
 136         vpclmulqdq      zmm14, zmm8, zmm16, 0x10
 137         vpclmulqdq      zmm15, zmm8, zmm16, 0x01
 138         vpxorq          zmm8, zmm14, zmm15
 139         vpxorq          zmm8, zmm8, zmm17
 140
 141         sub             arg3, 256
 142         jge             _fold_256_B_loop        ; loop while at least 256 more bytes were available
 143         ; on exit arg3 = (unread bytes) - 256, a negative value
 144         ;; Fold 256 into 128
 145         add             arg2, 256               ; arg2 now points at the first unread byte
 146         vpclmulqdq      zmm1, zmm0, zmm10, 0x01 ; zmm10 holds rk3/rk4 in every lane
 147         vpclmulqdq      zmm2, zmm0, zmm10, 0x10
 148         vpternlogq      zmm7, zmm1, zmm2, 0x96  ; xor ABC: zmm7 ^= zmm1 ^ zmm2 (zmm0 folded into zmm7)
 149
 150         vpclmulqdq      zmm5, zmm4, zmm10, 0x01
 151         vpclmulqdq      zmm6, zmm4, zmm10, 0x10
 152         vpternlogq      zmm8, zmm5, zmm6, 0x96  ; xor ABC: zmm8 ^= zmm5 ^ zmm6 (zmm4 folded into zmm8)
 153         ; move the 128 folded bytes into zmm0/zmm4, where _fold_128_B_register expects them
 154         vmovdqa32       zmm0, zmm7
 155         vmovdqa32       zmm4, zmm8
 156
 157         add             arg3, 128               ; re-bias to (unread bytes) - 128, the same value the 128-byte loop leaves on exit
 158         jmp             _fold_128_B_register
 159
 160         ; fold 128B at a time. This section of the code folds 2 zmm registers in parallel
 161 _fold_128_B_loop:       ; accumulators: zmm0, zmm4; constants: zmm10 (rk3/rk4 in every lane)
 162         add             arg2, 128       ; update the buffer pointer
 163         vmovdqu8        zmm8, [arg2+16*0]
 164         vpclmulqdq      zmm1, zmm0, zmm10, 0x10 ; per lane: low qword of zmm0 x high qword of zmm10 (rk4)
 165         vpclmulqdq      zmm2, zmm0, zmm10, 0x01 ; per lane: high qword of zmm0 x low qword of zmm10 (rk3)
 166         vpxorq          zmm0, zmm1, zmm2
 167         vpxorq          zmm0, zmm0, zmm8        ; zmm0 = both products xor new data
 168
 169         vmovdqu8        zmm9, [arg2+16*4]
 170         vpclmulqdq      zmm5, zmm4, zmm10, 0x10
 171         vpclmulqdq      zmm6, zmm4, zmm10, 0x01
 172         vpxorq          zmm4, zmm5, zmm6
 173         vpxorq          zmm4, zmm4, zmm9
 174
 175         sub             arg3, 128
 176         jge             _fold_128_B_loop        ; loop while at least 128 more bytes were available
 177         ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 178         ; on exit arg3 = y - 128 (negative), where y is the number of unread bytes
 179         add     arg2, 128
 180         ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
 181         ; the 128B of folded data is in 2 zmm registers: zmm0, zmm4
 182
 183 _fold_128_B_register:   ; in: zmm0/zmm4 = 128 folded bytes, arg3 = (unread bytes) - 128; out: xmm7 = 16 folded bytes, xmm10 = rk1/rk2
 184         ; fold the 8 128b parts into 1 xmm register with different constants
 185         vmovdqu8        zmm16, [rk9]            ; multiply by rk9-rk16
 186         vmovdqu8        zmm11, [rk17]           ; multiply by rk17-rk20, rk1,rk2, 0,0 (the rk_1b/rk_2b copies and the two zero qwords after rk20)
 187         vpclmulqdq      zmm1, zmm0, zmm16, 0x01
 188         vpclmulqdq      zmm2, zmm0, zmm16, 0x10
 189         vextracti64x2   xmm7, zmm4, 3           ; save last that has no multiplicand (the upper lanes of zmm7 are zeroed by this write)
 190
 191         vpclmulqdq      zmm5, zmm4, zmm11, 0x01 ; lane 3 is multiplied by the zero constants, so its products are 0
 192         vpclmulqdq      zmm6, zmm4, zmm11, 0x10
 193         vmovdqa         xmm10, [rk1]            ; Needed later in reduction loop
 194         vpternlogq      zmm1, zmm2, zmm5, 0x96  ; xor ABC
 195         vpternlogq      zmm1, zmm6, zmm7, 0x96  ; xor ABC (also xors the saved last 16 bytes into lane 0)
 196         ; horizontal xor of the four 128-bit lanes of zmm1 into xmm7
 197         vshufi64x2      zmm8, zmm1, zmm1, 0x4e ; Swap 1,0,3,2 - 01 00 11 10
 198         vpxorq          ymm8, ymm8, ymm1        ; ymm8 = (lane0 ^ lane2, lane1 ^ lane3)
 199         vextracti64x2   xmm5, ymm8, 1
 200         vpxorq          xmm7, xmm5, xmm8        ; xmm7 = lane0 ^ lane1 ^ lane2 ^ lane3
 201
 202         ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
 203         ; instead of a cmp instruction, we use the negative flag with the jl instruction
 204         add             arg3, 128-16            ; arg3 = (unread bytes) - 16
 205         jl              _final_reduction_for_128        ; fewer than 16 unread bytes: skip the 16-byte loop
 206
 207         ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
 208         ; we can fold 16 bytes at a time if y>=16
 209         ; continue folding 16B at a time
 210         ; loop state: xmm7 = 16 folded bytes, xmm10 = rk1 (low) / rk2 (high), arg2 = next unread byte, arg3 = (unread bytes) - 16
 211 _16B_reduction_loop:
 212         vmovdqa         xmm8, xmm7
 213         vpclmulqdq      xmm7, xmm10, 0x1        ; high qword of xmm7 x rk1
 214         vpclmulqdq      xmm8, xmm10, 0x10       ; low qword of the old xmm7 x rk2
 215         vpxor           xmm7, xmm8
 216         vmovdqu         xmm0, [arg2]            ; next 16 input bytes
 217         vpxor           xmm7, xmm0
 218         add             arg2, 16
 219         sub             arg3, 16
 220         ; instead of a cmp instruction, we utilize the flags with the jge instruction
 221         ; equivalent of: cmp arg3, 16-16
 222         ; check if there is any more 16B in the buffer to be able to fold
 223         jge             _16B_reduction_loop
 224
 225         ;now we have 16+z bytes left to reduce, where 0<= z < 16.
 226         ;first, we reduce the data in the xmm7 register
 227
 228
 229 _final_reduction_for_128:       ; in: xmm7 = 16 folded bytes, arg3 = z - 16
 230         add             arg3, 16                ; arg3 = z, the number of unread tail bytes
 231         je              _128_done               ; no tail bytes: go straight to the final 128-bit reduction
 232         ; here we are getting data that is less than 16 bytes.
 233         ; since we know that there was data before the pointer, we can offset
 234         ; the input pointer before the actual point, to receive exactly 16 bytes.
 235         ; after that the registers need to be adjusted.
 236 _get_last_two_xmms:     ; also entered from _less_than_32 with xmm7 = first 16 bytes xor crc, xmm10 = rk1/rk2, arg3 = 1..15
 237
 238
 239         vmovdqa         xmm2, xmm7
 240         vmovdqu         xmm1, [arg2 - 16 + arg3]        ; the last 16 bytes of the buffer; its top z bytes are the unread tail
 241
 242         ; get rid of the extra data that was loaded before
 243         ; load the shift constant
 244         lea             rax, [pshufb_shf_table]
 245         add             rax, arg3
 246         vmovdqu         xmm0, [rax]             ; shuffle control taken at byte offset z into the table
 247
 248
 249         vpshufb         xmm7, xmm0              ; xmm7 shifted left by (16 - z) bytes, zero filled
 250         vpxor           xmm0, [mask3]           ; flip bit 7 of every control byte, which inverts the selection
 251         vpshufb         xmm2, xmm0              ; old xmm7 shifted right by z bytes, zero filled
 252
 253         vpblendvb       xmm2, xmm2, xmm1, xmm0  ; take the top z bytes from xmm1 (the tail bytes), keep the rest of xmm2
 254         ;;;;;;;;;; fold xmm7 with rk1/rk2 and xor in the combined block xmm2
 255         vmovdqa         xmm8, xmm7
 256         vpclmulqdq      xmm7, xmm10, 0x1
 257
 258         vpclmulqdq      xmm8, xmm10, 0x10
 259         vpxor           xmm7, xmm8
 260         vpxor           xmm7, xmm2              ; falls through into _128_done
 261
 262 _128_done:              ; in: xmm7 = 128-bit folded value; out: rax (still to be complemented at _cleanup)
 263         ; compute crc of a 128-bit value
 264         vmovdqa         xmm10, [rk5]            ; xmm10 = rk5 (low) / rk6 (high)
 265         vmovdqa         xmm0, xmm7
 266
 267         ;64b fold
 268         vpclmulqdq      xmm7, xmm10, 0          ; low qword of xmm7 x rk5
 269         vpsrldq         xmm0, 8                 ; move the high qword of the old xmm7 into the low qword
 270         vpxor           xmm7, xmm0
 271
 272         ;barrett reduction
 273 _barrett:               ; also entered directly from _end_1to7 (inputs of 1..7 bytes skip the 64b fold)
 274         vmovdqa         xmm1, xmm7              ; keep the value being reduced
 275         vmovdqa         xmm10, [rk7]            ; xmm10 = rk7 (low) / rk8 (high)
 276
 277         vpclmulqdq      xmm7, xmm10, 0          ; low qword of xmm7 x rk7
 278         vmovdqa         xmm2, xmm7
 279         vpclmulqdq      xmm7, xmm10, 0x10       ; low qword of that product x rk8
 280         vpslldq         xmm2, 8                 ; first product shifted left by 8 bytes
 281         vpxor           xmm7, xmm2
 282         vpxor           xmm7, xmm1
 283         vpextrq         rax, xmm7, 1            ; the result is the high qword of xmm7; falls through into _cleanup
 284
 285 _cleanup:               ; common exit: rax holds the complemented result
 286         not             rax                     ; undo the complement applied to the crc at function entry
 287         ; NOTE(review): the >= 256-byte paths write zmm0-zmm15 and no vzeroupper is issued before ret - consider whether one is wanted here
 288
 289 %ifidn __OUTPUT_FORMAT__, win64
 290         vmovdqa         xmm6, [rsp + XMM_SAVE + 16*0]   ; restore the registers saved in the prologue
 291         vmovdqa         xmm7, [rsp + XMM_SAVE + 16*1]
 292         vmovdqa         xmm8, [rsp + XMM_SAVE + 16*2]
 293         vmovdqa         xmm9, [rsp + XMM_SAVE + 16*3]
 294         vmovdqa         xmm10, [rsp + XMM_SAVE + 16*4]
 295         vmovdqa         xmm11, [rsp + XMM_SAVE + 16*5]
 296         vmovdqa         xmm12, [rsp + XMM_SAVE + 16*6]
 297         vmovdqa         xmm13, [rsp + XMM_SAVE + 16*7]
 298         vmovdqa         xmm14, [rsp + XMM_SAVE + 16*8]
 299         vmovdqa         xmm15, [rsp + XMM_SAVE + 16*9]
 300 %endif
 301         add             rsp, VARIABLE_OFFSET    ; release the stack frame
 302         ret
 303
 304 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 305 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 306 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 307 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 308
 309 align 16
 310 _less_than_256:         ; len < 256: xmm-only path; arg1 already holds the complemented init_crc
 311
 312         ; check if there is enough buffer to be able to fold 16B at a time
 313         cmp     arg3, 32
 314         jl      _less_than_32
 315
 316         ; if there is, load the constants
 317         vmovdqa xmm10, [rk1]    ; rk1 and rk2 in xmm10
 318
 319         vmovq   xmm0, arg1      ; get the initial crc value
 320         vmovdqu xmm7, [arg2]    ; load the plaintext
 321         vpxor   xmm7, xmm0      ; xor the crc into the first 8 bytes of data
 322
 323         ; update the buffer pointer
 324         add     arg2, 16
 325
 326         ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
 327         sub     arg3, 32        ; arg3 = (unread bytes) - 16, the bias _16B_reduction_loop works with
 328
 329         jmp     _16B_reduction_loop
 330
 331 align 16
 332 _less_than_32:          ; len < 32: dispatch on len == 0, len < 16, len == 16, or 17..31
 333         ; mov initial crc to the return value. this is necessary for zero-length buffers.
 334         mov     rax, arg1       ; arg1 is the complemented crc; _cleanup complements rax again, so len == 0 returns init_crc
 335         test    arg3, arg3
 336         je      _cleanup
 337
 338         vmovq   xmm0, arg1      ; get the initial crc value
 339
 340         cmp     arg3, 16
 341         je      _exact_16_left
 342         jl      _less_than_16_left      ; reuses the flags of the cmp above
 343         ; 17..31 bytes: one full 16-byte block followed by a 1..15-byte tail
 344         vmovdqu xmm7, [arg2]    ; load the plaintext
 345         vpxor   xmm7, xmm0      ; xor the initial crc value
 346         add     arg2, 16
 347         sub     arg3, 16        ; arg3 = tail length (1..15)
 348         vmovdqa xmm10, [rk1]    ; rk1 and rk2 in xmm10
 349         jmp     _get_last_two_xmms
 350
 351
 352 align 16
 353 _less_than_16_left:     ; 1..15 bytes: copy the input into a zeroed 16-byte stack buffer; xmm0 = crc
 354         ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
 355
 356         vpxor   xmm1, xmm1
 357         mov     r11, rsp        ; r11 = write cursor into the scratch area at [rsp]
 358         vmovdqa [r11], xmm1
 359
 360         ; backup the counter value
 361         mov     r9, arg3        ; r9 keeps the original length; arg3 is counted down below
 362         cmp     arg3, 8
 363         jl      _less_than_8_left
 364
 365         ; load 8 Bytes
 366         mov     rax, [arg2]
 367         mov     [r11], rax
 368         add     r11, 8
 369         sub     arg3, 8
 370         add     arg2, 8
 371 _less_than_8_left:
 372
 373         cmp     arg3, 4
 374         jl      _less_than_4_left
 375
 376         ; load 4 Bytes
 377         mov     eax, [arg2]
 378         mov     [r11], eax
 379         add     r11, 4
 380         sub     arg3, 4
 381         add     arg2, 4
 382 _less_than_4_left:
 383
 384         cmp     arg3, 2
 385         jl      _less_than_2_left
 386
 387         ; load 2 Bytes
 388         mov     ax, [arg2]
 389         mov     [r11], ax
 390         add     r11, 2
 391         sub     arg3, 2
 392         add     arg2, 2
 393 _less_than_2_left:
 394         cmp     arg3, 1
 395         jl      _zero_left
 396
 397         ; load 1 Byte
 398         mov     al, [arg2]
 399         mov     [r11], al
 400
 401 _zero_left:             ; the stack buffer now holds the input bytes followed by zeros
 402         vmovdqa xmm7, [rsp]
 403         vpxor   xmm7, xmm0      ; xor the initial crc value
 404
 405         lea     rax,[pshufb_shf_table]
 406
 407         cmp     r9, 8
 408         jl      _end_1to7
 409
 410 _end_8to15:             ; 8..15 bytes: shift left by (16 - length) bytes, then run the full 128-bit reduction
 411         vmovdqu xmm0, [rax + r9]
 412         vpshufb xmm7,xmm0
 413         jmp     _128_done
 414
 415 _end_1to7:              ; 1..7 bytes: the 64b fold is skipped
 416         ; Left shift (8-length) bytes in XMM
 417         vmovdqu xmm0, [rax + r9 + 8]
 418         vpshufb xmm7,xmm0
 419
 420         jmp     _barrett
 421
 422 align 16
 423 _exact_16_left:         ; len == 16: a single block, no folding; xmm0 = crc (set in _less_than_32)
 424         vmovdqu xmm7, [arg2]
 425         vpxor   xmm7, xmm0      ; xor the initial crc value
 426
 427         jmp     _128_done       ; reduce the 128-bit value to the final 64 bits
 428
 429 section .data           ; NOTE(review): nothing in the visible code writes to these tables - confirm whether a read-only section would be appropriate
 430 align 32
 431
 432 %ifndef USE_CONSTS
 433 ; precomputed constants (the notes below say where the visible code loads each one)
 434 rk_1: dq 0x45000000b0000000     ; rk_1/rk_2: broadcast into zmm16 for _fold_256_B_loop
 435 rk_2: dq 0x6b700000f5000000
 436 rk1:  dq 0xf500000000000001     ; rk1/rk2: loaded into xmm10 for the 16-byte folds
 437 rk2:  dq 0x6b70000000000001
 438 rk3:  dq 0xb001000000010000     ; rk3/rk4: broadcast into zmm10 for _fold_128_B_loop and the 256 -> 128 fold
 439 rk4:  dq 0xf501b0000001b000
 440 rk5:  dq 0xf500000000000001     ; rk5/rk6: loaded at _128_done; only rk5 is used as a multiplier there
 441 rk6:  dq 0x0000000000000000
 442 rk7:  dq 0xb000000000000001     ; rk7/rk8: loaded at _barrett
 443 rk8:  dq 0xb000000000000000
 444 rk9:  dq 0xe014514514501501     ; rk9..rk16: the 64 bytes loaded into zmm16 at _fold_128_B_register
 445 rk10: dq 0x771db6db6db71c71
 446 rk11: dq 0xa101101101110001
 447 rk12: dq 0x1ab1ab1ab1aab001
 448 rk13: dq 0xf445014445000001
 449 rk14: dq 0x6aab71daab700001
 450 rk15: dq 0xb100010100000001
 451 rk16: dq 0x01b001b1b0000001
 452 rk17: dq 0xe145150000000001     ; rk17 starts the 64 bytes loaded into zmm11: rk17..rk20, rk_1b, rk_2b, 0, 0
 453 rk18: dq 0x76db6c7000000001
 454 rk19: dq 0xa011000000000001
 455 rk20: dq 0x1b1ab00000000001
 456
 457 rk_1b: dq 0xf500000000000001    ; same values as rk1/rk2, placed here so the 64-byte load from rk17 picks them up
 458 rk_2b: dq 0x6b70000000000001
 459         dq 0x0000000000000000   ; zero multipliers for the last 128-bit lane of that load
 460         dq 0x0000000000000000
 461 %else
 462 INCLUDE_CONSTS          ; the constants are supplied through INCLUDE_CONSTS when USE_CONSTS is defined; presumably it defines the same labels in the same layout - verify against the includer
 463 %endif
 464
 465 pshufb_shf_table:
 466 ; use these values for shift constants for the pshufb instruction
 467 ; different alignments result in values as shown:
 468 ;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
 469 ;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
 470 ;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
 471 ;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
 472 ;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
 473 ;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
 474 ;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
 475 ;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
 476 ;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
 477 ;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
 478 ;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
 479 ;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
 480 ;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
 481 ;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
 482 ;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
 483 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988       ; a 16-byte window is read at byte offset n; control bytes with bit 7 set make pshufb write zero
 484 dq 0x0706050403020100, 0x000e0d0c0b0a0908       ; low control bytes 0x00..0x0e select source bytes 0..14
 485
 486 mask:  dq     0xFFFFFFFFFFFFFFFF, 0x0000000000000000   ; not referenced in the visible code
 487 mask2: dq     0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF   ; not referenced in the visible code
 488 mask3: dq     0x8080808080808080, 0x8080808080808080   ; xored into a shuffle control in _get_last_two_xmms to flip bit 7 of every byte
 489
 490 %else  ; AS_FEATURE_LEVEL < 10: the assembler doesn't understand these opcodes, so the implementation above is skipped. Add empty symbol for windows.
 491 %ifidn __OUTPUT_FORMAT__, win64
 492 global no_ %+ FUNCTION_NAME
 493 no_ %+ FUNCTION_NAME %+ :
 494 %endif  ; win64 only: a placeholder global label named no_<FUNCTION_NAME> with no code behind it
 495 %endif ; (AS_FEATURE_LEVEL) >= 10