;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
;       Function API:
;       UINT16 crc16_t10dif_by4(
;               UINT16 init_crc,          // initial CRC value, 16 bits
;               const unsigned char *buf, // buffer pointer to calculate CRC on
;               UINT64 len                // buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://download.intel.com/design/intarch/papers/323102.pdf
;
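;       A minimal C-side usage sketch (illustrative only; the seed, buffer
;       and length below are arbitrary example values, not part of this file):
;
;               #include <stdint.h>
;               uint16_t crc16_t10dif_by4(uint16_t init_crc,
;                                         const unsigned char *buf, uint64_t len);
;
;               unsigned char data[512];
;               uint16_t crc = crc16_t10dif_by4(0, data, sizeof(data));
;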

%include "reg_sizes.asm"

[bits 64]
default rel

section .text
%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8

        %xdefine        arg1_low32 ecx
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx

        %xdefine        arg1_low32 edi
%endif

align 16
global crc16_t10dif_by4:function
crc16_t10dif_by4:

        ; adjust the 16-bit initial_crc value, scale it to 32 bits
        shl     arg1_low32, 16

        ; After this point, the code flow is exactly the same as for a 32-bit CRC.
        ; The only difference is that before returning eax, we shift it
        ; right by 16 bits, to scale back down to 16 bits.
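        ; equivalently: crc16(init, buf, len) == crc32_style(init << 16, buf, len) >> 16,
        ; where crc32_style uses the T10-DIF polynomial scaled to 32 bits
        ; (0x8bb7 << 16 = 0x8bb70000; see the constants section below)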

        sub     rsp, 16*4+8

        ; save xmm6 and xmm7 on the stack
        ; (callee-saved under the win64 calling convention)
        movdqa  [rsp+16*2], xmm6
        movdqa  [rsp+16*3], xmm7

        ; check if smaller than 128B
        cmp     arg3, 128

        ; for sizes less than 128, we can't fold 64B at a time...
        jl      _less_than_128


        ; load the initial crc value
        movd    xmm6, arg1_low32        ; initial crc

        ; the crc value does not need to be byte-reflected, but it does need
        ; to be moved to the high part of the register,
        ; because the data will be byte-reflected and will then align with
        ; the initial crc in the correct place.
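        ; (SHUF_MASK, defined at the bottom of this file, is a full byte
        ; reversal: after pshufb, output byte i = input byte 15-i, so the
        ; message's first byte lands in the most significant byte lane,
        ; lining up with the crc that pslldq places in the top 4 bytes)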
        pslldq  xmm6, 12

        movdqa  xmm7, [SHUF_MASK]
        ; load the initial 64B of data and xor in the initial crc value
        movdqu  xmm0, [arg2]
        movdqu  xmm1, [arg2+16]
        movdqu  xmm2, [arg2+32]
        movdqu  xmm3, [arg2+48]

        pshufb  xmm0, xmm7
        ; XOR the initial_crc value
        pxor    xmm0, xmm6
        pshufb  xmm1, xmm7
        pshufb  xmm2, xmm7
        pshufb  xmm3, xmm7

        movdqa  xmm6, [rk3]     ; xmm6 has rk3 and rk4
                                ; the imm value of the pclmulqdq instruction
                                ; will determine which constant to use
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; we subtract 128 instead of 64 to save one instruction from the loop
        sub     arg3, 128

        ; at this point there are 64*x+y (0 <= y < 64) bytes of buffer left.
        ; the _fold_64_B_loop below will fold 64B at a time
        ; until only 64+y bytes of buffer remain


        ; fold 64B at a time. This section of the code folds 4 xmm
        ; registers in parallel
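        ; each iteration folds the four xmm registers forward over the next
        ; 64B of input, independently per register:
        ;   x = clmul(x_hi, rk4) xor clmul(x_lo, rk3) xor next_16B_of_data
        ; where rk3/rk4 (loaded into xmm6 above) are the precomputed folding
        ; factors for a 64B (512-bit) distance; see the constants section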
_fold_64_B_loop:

        ; update the buffer pointer
        add     arg2, 64                ; buf += 64;

        movdqu  xmm4, xmm0
        movdqu  xmm5, xmm1

        pclmulqdq       xmm0, xmm6, 0x11
        pclmulqdq       xmm1, xmm6, 0x11

        pclmulqdq       xmm4, xmm6, 0x0
        pclmulqdq       xmm5, xmm6, 0x0

        pxor    xmm0, xmm4
        pxor    xmm1, xmm5

        movdqu  xmm4, xmm2
        movdqu  xmm5, xmm3

        pclmulqdq       xmm2, xmm6, 0x11
        pclmulqdq       xmm3, xmm6, 0x11

        pclmulqdq       xmm4, xmm6, 0x0
        pclmulqdq       xmm5, xmm6, 0x0

        pxor    xmm2, xmm4
        pxor    xmm3, xmm5

        movdqu  xmm4, [arg2]
        movdqu  xmm5, [arg2+16]
        pshufb  xmm4, xmm7
        pshufb  xmm5, xmm7
        pxor    xmm0, xmm4
        pxor    xmm1, xmm5

        movdqu  xmm4, [arg2+32]
        movdqu  xmm5, [arg2+48]
        pshufb  xmm4, xmm7
        pshufb  xmm5, xmm7

        pxor    xmm2, xmm4
        pxor    xmm3, xmm5

        sub     arg3, 64

        ; check if there is another 64B in the buffer to be able to fold
        jge     _fold_64_B_loop
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


        add     arg2, 64
        ; at this point, the buffer pointer is pointing at the last y bytes of the buffer
        ; the 64B of folded data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3


        ; fold the 4 xmm registers to 1 xmm register with different constants

        movdqa  xmm6, [rk1]     ; xmm6 has rk1 and rk2
                                ; the imm value of the pclmulqdq instruction will
                                ; determine which constant to use
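        ; each fold below absorbs one register into its 16B-distant neighbor:
        ;   dst = dst xor clmul(src_hi, rk2) xor clmul(src_lo, rk1)
        ; xmm1 absorbs xmm0, xmm2 absorbs xmm1, xmm3 absorbs xmm2, leaving
        ; all 64B of folded state in xmm3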

        movdqa  xmm4, xmm0
        pclmulqdq       xmm0, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm1, xmm4
        pxor    xmm1, xmm0

        movdqa  xmm4, xmm1
        pclmulqdq       xmm1, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm2, xmm4
        pxor    xmm2, xmm1

        movdqa  xmm4, xmm2
        pclmulqdq       xmm2, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2


        ; instead of 64, we add 48 to the loop counter to save 1 instruction from the loop
        ; instead of a cmp instruction, we use the sign flag with the jl instruction
        add     arg3, 64-16
        jl      _final_reduction_for_128

        ; now we have 16+y bytes left to reduce. 16 bytes
        ; are in register xmm3 and the rest is in memory.
        ; we can fold 16 bytes at a time if y >= 16;
        ; continue folding 16B at a time
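        ; each iteration folds the running 16B state into the next 16B of input:
        ;   xmm3 = clmul(xmm3_hi, rk2) xor clmul(xmm3_lo, rk1) xor next_16B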

_16B_reduction_loop:
        movdqa  xmm4, xmm3
        pclmulqdq       xmm3, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        movdqu  xmm0, [arg2]
        pshufb  xmm0, xmm7
        pxor    xmm3, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the jge instruction
        ; equivalent of: cmp arg3, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge     _16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm3 register


_final_reduction_for_128:
        ; check if any more data to fold. If not, compute the CRC of the final 128 bits
        add     arg3, 16
        je      _128_done

        ; here we have fewer than 16 bytes of data left.
        ; since we know there was data before the pointer, we can offset
        ; the input pointer back so that we load exactly 16 bytes;
        ; after that, the registers need to be adjusted.
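        ; sketch of the merge below: xmm1 gets the last 16B of the buffer
        ; (its leading bytes overlap data that was already folded), the
        ; pshufb_shf_table entry shifts the folded state (xmm2/xmm3) so the
        ; two registers line up, and pblendvb stitches them together so that
        ; a single final 16B fold consumes the remaining partial block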
_get_last_two_xmms:
        movdqa  xmm2, xmm3

        movdqu  xmm1, [arg2 - 16 + arg3]
        pshufb  xmm1, xmm7

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, arg3
        movdqu  xmm0, [rax]

        ; shift xmm2 to the left by arg3 bytes
        pshufb  xmm2, xmm0

        ; shift xmm3 to the right by 16-arg3 bytes
        pxor    xmm0, [mask1]
        pshufb  xmm3, xmm0
        pblendvb        xmm1, xmm2      ; xmm0 is implicit

        ; fold 16 bytes
        movdqa  xmm2, xmm1
        movdqa  xmm4, xmm3
        pclmulqdq       xmm3, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2

_128_done:
        ; compute the crc of a 128-bit value
        movdqa  xmm6, [rk5]     ; rk5 and rk6 in xmm6
        movdqa  xmm0, xmm3

        ; 64b fold
        pclmulqdq       xmm3, xmm6, 0x1
        pslldq  xmm0, 8
        pxor    xmm3, xmm0

        ; 32b fold
        movdqa  xmm0, xmm3

        pand    xmm0, [mask2]

        psrldq  xmm3, 12
        pclmulqdq       xmm3, xmm6, 0x10
        pxor    xmm3, xmm0

        ; barrett reduction
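        ; Barrett reduction computes the final 32-bit remainder without a
        ; division (see the reference paper above); in polynomial terms,
        ; with all multiplications carry-less:
        ;   T1  = floor(R / x^32) * rk7          (rk7 = floor(2^64/Q))
        ;   T2  = floor(T1 / x^32) * rk8         (rk8 = Q)
        ;   crc = (R xor T2) mod x^32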
_barrett:
        movdqa  xmm6, [rk7]     ; rk7 and rk8 in xmm6
        movdqa  xmm0, xmm3
        pclmulqdq       xmm3, xmm6, 0x01
        pslldq  xmm3, 4
        pclmulqdq       xmm3, xmm6, 0x11

        pslldq  xmm3, 4
        pxor    xmm3, xmm0
        pextrd  eax, xmm3, 1

_cleanup:
        ; scale the result back to 16 bits
        shr     eax, 16
        movdqa  xmm6, [rsp+16*2]
        movdqa  xmm7, [rsp+16*3]
        add     rsp, 16*4+8
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_128:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      _less_than_32
        movdqa  xmm7, [SHUF_MASK]

        ; if there is, load the constants
        movdqa  xmm6, [rk1]     ; rk1 and rk2 in xmm6

        movd    xmm0, arg1_low32        ; get the initial crc value
        pslldq  xmm0, 12                ; align it to its correct place
        movdqu  xmm3, [arg2]            ; load the plaintext
        pshufb  xmm3, xmm7              ; byte-reflect the plaintext
        pxor    xmm3, xmm0


        ; update the buffer pointer
        add     arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        sub     arg3, 32

        jmp     _16B_reduction_loop


align 16
_less_than_32:
        ; mov initial crc to the return value. this is necessary for zero-length buffers.
        mov     eax, arg1_low32
        test    arg3, arg3
        je      _cleanup

        movdqa  xmm7, [SHUF_MASK]

        movd    xmm0, arg1_low32        ; get the initial crc value
        pslldq  xmm0, 12                ; align it to its correct place

        cmp     arg3, 16
        je      _exact_16_left
        jl      _less_than_16_left

        movdqu  xmm3, [arg2]            ; load the plaintext
        pshufb  xmm3, xmm7              ; byte-reflect the plaintext
        pxor    xmm3, xmm0              ; xor the initial crc value
        add     arg2, 16
        sub     arg3, 16
        movdqa  xmm6, [rk1]             ; rk1 and rk2 in xmm6
        jmp     _get_last_two_xmms


align 16
_less_than_16_left:
        ; use stack space to load data less than 16 bytes; zero out the 16B in memory first.
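        ; the copies below stage the y (< 16) remaining bytes into the zeroed
        ; stack slot in descending chunks (8, 4, 2, then 1 bytes), so a single
        ; 16B load can then pick the data up with the unused bytes already zero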

        pxor    xmm1, xmm1
        mov     r11, rsp
        movdqa  [r11], xmm1

        cmp     arg3, 4
        jl      _only_less_than_4

        ; backup the counter value
        mov     r9, arg3
        cmp     arg3, 8
        jl      _less_than_8_left

        ; load 8 bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
_less_than_8_left:

        cmp     arg3, 4
        jl      _less_than_4_left

        ; load 4 bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
_less_than_4_left:

        cmp     arg3, 2
        jl      _less_than_2_left

        ; load 2 bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
_less_than_2_left:
        cmp     arg3, 1
        jl      _zero_left

        ; load 1 byte
        mov     al, [arg2]
        mov     [r11], al
_zero_left:
        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

        ; shl r9, 4
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, r9
        movdqu  xmm0, [rax]
        pxor    xmm0, [mask1]

        pshufb  xmm3, xmm0
        jmp     _128_done

align 16
_exact_16_left:
        movdqu  xmm3, [arg2]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

        jmp     _128_done

_only_less_than_4:
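        ; with only y (1 <= y <= 3) input bytes there is not enough data for a
        ; 128-bit fold: the bytes are staged on the stack, byte-reflected and
        ; xored with the crc, then the state is shifted right by 8-y bytes
        ; (psrldq 5/6/7 below) and control jumps straight to _barrett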
        cmp     arg3, 3
        jl      _only_less_than_3

        ; load 3 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        mov     al, [arg2+2]
        mov     [r11+2], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

        psrldq  xmm3, 5

        jmp     _barrett
_only_less_than_3:
        cmp     arg3, 2
        jl      _only_less_than_2

        ; load 2 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

        psrldq  xmm3, 6

        jmp     _barrett
_only_less_than_2:

        ; load 1 byte
        mov     al, [arg2]
        mov     [r11], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

        psrldq  xmm3, 7

        jmp     _barrett
section .data

; precomputed constants
; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
align 16
; Q = 0x18BB70000
; rk1 = 2^(32*3) mod Q << 32
; rk2 = 2^(32*5) mod Q << 32
; rk3 = 2^(32*15) mod Q << 32
; rk4 = 2^(32*17) mod Q << 32
; rk5 = 2^(32*3) mod Q << 32
; rk6 = 2^(32*2) mod Q << 32
; rk7 = floor(2^64/Q)
; rk8 = Q
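; a worked example of how the table values follow from the formulas above
; (carry-less, i.e. GF(2), arithmetic; this restates the comments and is
; not used by the assembler):
;   rk1 = (x^(32*3) mod Q) << 32
;       = 0x2d560000 << 32 = 0x2d56000000000000
;   rk8 = Q = 0x18BB70000 (the T10-DIF poly 0x18bb7 scaled up by x^16)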
rk1:
        DQ 0x2d56000000000000
rk2:
        DQ 0x06df000000000000
rk3:
        DQ 0x044c000000000000
rk4:
        DQ 0xe658000000000000
rk5:
        DQ 0x2d56000000000000
rk6:
        DQ 0x1368000000000000
rk7:
        DQ 0x00000001f65a57f8
rk8:
        DQ 0x000000018bb70000
mask1:
        dq 0x8080808080808080, 0x8080808080808080
mask2:
        dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

SHUF_MASK:
        dq 0x08090A0B0C0D0E0F, 0x0001020304050607

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89       ; shl 15 (16-1) / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a       ; shl 14 (16-2) / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b       ; shl 13 (16-3) / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c       ; shl 12 (16-4) / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d       ; shl 11 (16-5) / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e       ; shl 10 (16-6) / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f       ; shl 9  (16-7) / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100       ; shl 8  (16-8) / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201       ; shl 7  (16-9) / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302       ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403       ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504       ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605       ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706       ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807       ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908

;;;       func        core, ver, snum
slversion crc16_t10dif_by4, 05, 02, 0016