ceph/src/isa-l/crc/crc16_t10dif_by4.asm

   1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
   2 ;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
   3 ;
   4 ;  Redistribution and use in source and binary forms, with or without
   5 ;  modification, are permitted provided that the following conditions
   6 ;  are met:
   7 ;    * Redistributions of source code must retain the above copyright
   8 ;      notice, this list of conditions and the following disclaimer.
   9 ;    * Redistributions in binary form must reproduce the above copyright
  10 ;      notice, this list of conditions and the following disclaimer in
  11 ;      the documentation and/or other materials provided with the
  12 ;      distribution.
  13 ;    * Neither the name of Intel Corporation nor the names of its
  14 ;      contributors may be used to endorse or promote products derived
  15 ;      from this software without specific prior written permission.
  16 ;
  17 ;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 ;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 ;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 ;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 ;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 ;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 ;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 ;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 ;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 ;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 ;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  29 ;
  30 ;       Function API:
  31 ;       UINT16 crc16_t10dif_by4(
  32 ;               UINT16 init_crc, //initial CRC value, 16 bits
  33 ;               const unsigned char *buf, //buffer pointer to calculate CRC on
  34 ;               UINT64 len //buffer length in bytes (64-bit data)
  35 ;       );
  36 ;
  37 ;       Authors:
  38 ;               Erdinc Ozturk
  39 ;               Vinodh Gopal
  40 ;               James Guilford
  41 ;
  42 ;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
  43 ;       URL: http://download.intel.com/design/intarch/papers/323102.pdf
  44 ;
  45
  46 %include "reg_sizes.asm"
  47
  48 %define fetch_dist      1024
  49 ; fetch_dist: byte offset added to the buffer pointer by the prefetchnta hints in _fold_64_B_loop
  50 [bits 64]
  51 default rel
  52 ; 64-bit code; "default rel" makes bare [label] memory operands RIP-relative
  53 section .text
     ; Argument registers. Following the Function API comment at the top of
     ; this file: arg1 = init_crc, arg2 = buf, arg3 = len.
     ; arg1_low32 is the 32-bit view of arg1.
  54 %ifidn __OUTPUT_FORMAT__, win64
  55         %xdefine        arg1 rcx
  56         %xdefine        arg2 rdx
  57         %xdefine        arg3 r8
  58         ; (win64 output: rcx, rdx, r8 are the first three integer argument registers)
  59         %xdefine        arg1_low32 ecx
  60 %else
  61         %xdefine        arg1 rdi
  62         %xdefine        arg2 rsi
  63         %xdefine        arg3 rdx
  64         ; (every other output format: rdi, rsi, rdx, the System V AMD64 argument registers)
  65         %xdefine        arg1_low32 edi
  66 %endif
  67
  68 align 16
     ;------------------------------------------------------------------------
     ; crc16_t10dif_by4: entry point and main path (len >= 128).
     ;
     ; In:     arg1_low32 = init_crc, arg2 = buf, arg3 = len in bytes
     ; Out:    eax = CRC. It is computed as a 32-bit value and shifted right
     ;         by 16 in _cleanup.
     ; Stack:  16*4+8 bytes are reserved. xmm6/xmm7 are saved at [rsp+16*2]
     ;         and [rsp+16*3]; [rsp] serves as a 16-byte scratch buffer on
     ;         the paths for len < 16.
     ; Writes: rax, arg1_low32, arg2, arg3, xmm0-xmm7 (xmm6/xmm7 are restored
     ;         before returning); r9 and r11 on the len < 16 paths.
     ;
     ; Labels below that are also jump targets of the short-length paths:
     ; _16B_reduction_loop, _get_last_two_xmms, _128_done, _barrett, _cleanup.
     ;------------------------------------------------------------------------
  69 mk_global       crc16_t10dif_by4, function
  70 crc16_t10dif_by4:
  71         endbranch
  72
  73         ; adjust the 16-bit initial_crc value, scale it to 32 bits
  74         shl     arg1_low32, 16
  75
  76         ; After this point, the code flow is exactly the same as for a
  77         ; 32-bit CRC. The only difference is that before returning eax we
  78         ; shift it right by 16 bits, to scale it back to 16 bits.
  79
  80         sub     rsp,16*4+8
  81         ; NOTE(review): the +8 presumably brings rsp to a 16-byte boundary (rsp is 8 mod 16 at entry under the usual ABIs), which the movdqa stores below need -- confirm
  82         ; save xmm6 and xmm7 on the stack so that _cleanup can restore them (they are callee-saved on Windows x64; saved here for every output format)
  83         movdqa [rsp+16*2],xmm6
  84         movdqa [rsp+16*3],xmm7
  85
  86         ; check if smaller than 128B
  87         cmp     arg3, 128
  88
  89         ; for sizes less than 128, we can't fold 64B at a time...
  90         jl      _less_than_128
  91         ; NOTE(review): jl is a signed test, so a len with bit 63 set would
  92         ; also take the _less_than_128 branch.
  93         ; load the initial crc value
  94         movd    xmm6, arg1_low32        ; initial crc (already shifted left by 16)
  95
  96         ; crc value does not need to be byte-reflected, but it needs to
  97         ; be moved to the high part of the register,
  98         ; because the data will be byte-reflected and will then align
  99         ; with the initial crc at the correct place.
 100         pslldq  xmm6, 12
 101
 102         movdqa xmm7, [SHUF_MASK]        ; xmm7 = pshufb mask that reverses the 16 bytes of a register
 103         ; receive the initial 64B data, xor the initial crc value
 104         movdqu  xmm0, [arg2]
 105         movdqu  xmm1, [arg2+16]
 106         movdqu  xmm2, [arg2+32]
 107         movdqu  xmm3, [arg2+48]
 108
 109         pshufb  xmm0, xmm7
 110         ; XOR the initial_crc value into the first 16 bytes only
 111         pxor    xmm0, xmm6
 112         pshufb  xmm1, xmm7
 113         pshufb  xmm2, xmm7
 114         pshufb  xmm3, xmm7
 115
 116         movdqa  xmm6, [rk3]     ;xmm6 has rk3 (low qword) and rk4 (high qword)
 117                                         ;imm value of pclmulqdq instruction
 118                                         ;will determine which constant to use
 119         ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 120         ; we subtract 128 instead of 64 to save one instruction from the loop
 121         sub     arg3, 128
 122
 123         ; at this section of the code, there are 64*x+y (0<=y<64) bytes of
 124         ; buffer. The _fold_64_B_loop
 125         ; loop will fold 64B at a time until we have 64+y Bytes of buffer.
 126         ; Invariant at the loop head: arg3 = (bytes not yet loaded) - 64, and
 127         ; arg2 points at the 64 bytes that were loaded most recently.
 128         ; fold 64B at a time. This section of the code folds 4 xmm
 129         ; registers in parallel
 130 _fold_64_B_loop:
 131
 132         ; update the buffer pointer
 133         add     arg2, 64                ;    buf += 64;
 134
 135         prefetchnta [arg2+fetch_dist+0]
 136         movdqu  xmm4, xmm0              ; keep copies: each register is multiplied twice
 137         movdqu  xmm5, xmm1
 138
 139         pclmulqdq       xmm0, xmm6 , 0x11       ; high qword * rk4
 140         pclmulqdq       xmm1, xmm6 , 0x11
 141
 142         pclmulqdq       xmm4, xmm6, 0x0         ; low qword * rk3
 143         pclmulqdq       xmm5, xmm6, 0x0
 144
 145         pxor    xmm0, xmm4
 146         pxor    xmm1, xmm5
 147
 148         prefetchnta [arg2+fetch_dist+32]
 149         movdqu  xmm4, xmm2
 150         movdqu  xmm5, xmm3
 151
 152         pclmulqdq       xmm2, xmm6, 0x11
 153         pclmulqdq       xmm3, xmm6, 0x11
 154
 155         pclmulqdq       xmm4, xmm6, 0x0
 156         pclmulqdq       xmm5, xmm6, 0x0
 157
 158         pxor    xmm2, xmm4
 159         pxor    xmm3, xmm5
 160         ; load the next 64 bytes, byte-reverse them and xor them into the folded state
 161         movdqu  xmm4, [arg2]
 162         movdqu  xmm5, [arg2+16]
 163         pshufb  xmm4, xmm7
 164         pshufb  xmm5, xmm7
 165         pxor    xmm0, xmm4
 166         pxor    xmm1, xmm5
 167
 168         movdqu  xmm4, [arg2+32]
 169         movdqu  xmm5, [arg2+48]
 170         pshufb  xmm4, xmm7
 171         pshufb  xmm5, xmm7
 172
 173         pxor    xmm2, xmm4
 174         pxor    xmm3, xmm5
 175
 176         sub     arg3, 64
 177
 178         ; check if there is another 64B in the buffer to be able to fold
 179         jge     _fold_64_B_loop
 180         ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 181         ; here arg3 is in [-64, -1]; y = arg3 + 64 bytes have not been loaded yet
 182
 183         add     arg2, 64
 184         ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
 185         ; the 64B of folded data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3
 186
 187
 188         ; fold the 4 xmm registers to 1 xmm register with different constants
 189
 190         movdqa  xmm6, [rk1]     ;xmm6 has rk1 (low qword) and rk2 (high qword)
 191                                         ;imm value of pclmulqdq instruction will
 192                                         ;determine which constant to use
 193         ; fold xmm0 into xmm1
 194         movdqa  xmm4, xmm0
 195         pclmulqdq       xmm0, xmm6, 0x11
 196         pclmulqdq       xmm4, xmm6, 0x0
 197         pxor    xmm1, xmm4
 198         pxor    xmm1, xmm0
 199         ; fold xmm1 into xmm2
 200         movdqa  xmm4, xmm1
 201         pclmulqdq       xmm1, xmm6, 0x11
 202         pclmulqdq       xmm4, xmm6, 0x0
 203         pxor    xmm2, xmm4
 204         pxor    xmm2, xmm1
 205         ; fold xmm2 into xmm3; xmm3 then holds the whole folded state
 206         movdqa  xmm4, xmm2
 207         pclmulqdq       xmm2, xmm6, 0x11
 208         pclmulqdq       xmm4, xmm6, 0x0
 209         pxor    xmm3, xmm4
 210         pxor    xmm3, xmm2
 211
 212
 213         ; instead of 64, we add 48 to the loop counter to save 1 instruction from the loop
 214         ; instead of a cmp instruction, we use the negative flag with the jl instruction
 215         add     arg3, 64-16             ; arg3 = y - 16
 216         jl      _final_reduction_for_128
 217
 218         ; now we have 16+y bytes left to reduce. 16 Bytes
 219         ; are in register xmm3 and the rest is in memory
 220         ; we can fold 16 bytes at a time if y>=16
 221         ; continue folding 16B at a time
 222         ; (also entered from _less_than_128; at the loop head arg3 = (bytes not yet loaded) - 16 and xmm6 = rk1:rk2)
 223 _16B_reduction_loop:
 224         movdqa  xmm4, xmm3
 225         pclmulqdq       xmm3, xmm6, 0x11
 226         pclmulqdq       xmm4, xmm6, 0x0
 227         pxor    xmm3, xmm4
 228         movdqu  xmm0, [arg2]
 229         pshufb  xmm0, xmm7
 230         pxor    xmm3, xmm0
 231         add     arg2, 16
 232         sub     arg3, 16
 233         ; instead of a cmp instruction, we utilize the flags with the jge instruction
 234         ; equivalent of: cmp arg3, 16-16
 235         ; check if there is any more 16B in the buffer to be able to fold
 236         jge     _16B_reduction_loop
 237
 238         ;now we have 16+z bytes left to reduce, where 0<= z < 16.
 239         ;first, we reduce the data in the xmm3 register
 240
 241
 242 _final_reduction_for_128:
 243         ; check if any more data to fold. If not, compute the CRC of the final 128 bits
 244         add     arg3, 16                ; arg3 = z, the number of trailing bytes (0..15)
 245         je      _128_done
 246
 247         ; here we are getting data that is less than 16 bytes.
 248         ; since we know that there was data before the pointer,
 249         ; we can offset the input pointer before the actual point,
 250         ; to receive exactly 16 bytes.
 251         ; after that the registers need to be adjusted.
 252 _get_last_two_xmms:
 253         movdqa  xmm2, xmm3
 254         ; load the 16 bytes that end exactly at the end of the buffer
 255         movdqu  xmm1, [arg2 - 16 + arg3]
 256         pshufb  xmm1, xmm7
 257
 258         ; get rid of the extra data that was loaded before
 259         ; load the shift constant: 16 bytes starting at pshufb_shf_table + 16 - arg3
 260         lea     rax, [pshufb_shf_table + 16]
 261         sub     rax, arg3
 262         movdqu  xmm0, [rax]
 263
 264         ; shift xmm2 to the left by arg3 bytes
 265         pshufb  xmm2, xmm0
 266
 267         ; shift xmm3 to the right by 16-arg3 bytes (mask1 flips bit 7 of every mask byte)
 268         pxor    xmm0, [mask1]
 269         pshufb  xmm3, xmm0
 270         pblendvb        xmm1, xmm2      ;xmm0 is implicit: bytes whose mask byte has bit 7 set are taken from xmm2
 271
 272         ; fold 16 Bytes
 273         movdqa  xmm2, xmm1
 274         movdqa  xmm4, xmm3
 275         pclmulqdq       xmm3, xmm6, 0x11
 276         pclmulqdq       xmm4, xmm6, 0x0
 277         pxor    xmm3, xmm4
 278         pxor    xmm3, xmm2
 279
 280 _128_done:
 281         ; compute crc of a 128-bit value
 282         movdqa  xmm6, [rk5]     ; rk5 (low qword) and rk6 (high qword) in xmm6
 283         movdqa  xmm0, xmm3
 284
 285         ;64b fold
 286         pclmulqdq       xmm3, xmm6, 0x1         ; high qword of xmm3 * rk5
 287         pslldq  xmm0, 8
 288         pxor    xmm3, xmm0
 289
 290         ;32b fold
 291         movdqa  xmm0, xmm3
 292
 293         pand    xmm0, [mask2]                   ; clear the top 32 bits of the copy
 294
 295         psrldq  xmm3, 12                        ; bring the top 32 bits down to the low dword
 296         pclmulqdq       xmm3, xmm6, 0x10        ; low qword of xmm3 * rk6
 297         pxor    xmm3, xmm0
 298
 299         ;barrett reduction
 300 _barrett:
 301         movdqa  xmm6, [rk7]     ; rk7 (low qword) and rk8 (high qword) in xmm6
 302         movdqa  xmm0, xmm3
 303         pclmulqdq       xmm3, xmm6, 0x01        ; high qword of xmm3 * rk7
 304         pslldq  xmm3, 4
 305         pclmulqdq       xmm3, xmm6, 0x11        ; high qword of xmm3 * rk8
 306
 307         pslldq  xmm3, 4
 308         pxor    xmm3, xmm0
 309         pextrd  eax, xmm3,1                     ; the 32-bit result is dword 1 of xmm3
 310
 311 _cleanup:
 312         ; scale the result back to 16 bits
 313         shr     eax, 16
 314         movdqa  xmm6, [rsp+16*2]
 315         movdqa  xmm7, [rsp+16*3]
 316         add     rsp,16*4+8
 317         ret
 318
 319
 320 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 321 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 322 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 323 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 324
 325 align 16
 326 _less_than_128:
 327         ; Path for len < 128, reached from the length check at function entry.
 328         ; check if there is enough buffer to be able to fold 16B at a time
 329         cmp     arg3, 32
 330         jl      _less_than_32
 331         movdqa xmm7, [SHUF_MASK]        ; xmm7 = pshufb mask that reverses the 16 bytes of a register
 332
 333         ; if there is, load the constants
 334         movdqa  xmm6, [rk1]     ; rk1 (low qword) and rk2 (high qword) in xmm6
 335
 336         movd    xmm0, arg1_low32        ; get the initial crc value
 337         pslldq  xmm0, 12        ; align it to its correct place (the top 32 bits)
 338         movdqu  xmm3, [arg2]    ; load the plaintext
 339         pshufb  xmm3, xmm7      ; byte-reflect the plaintext
 340         pxor    xmm3, xmm0      ; xor the initial crc value
 341
 342
 343         ; update the buffer pointer
 344         add     arg2, 16
 345
 346         ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
 347         sub     arg3, 32        ; arg3 = (bytes not yet loaded) - 16, which is >= 0 here
 348         ; continue in the shared 16-byte folding loop with xmm3 as the running state
 349         jmp     _16B_reduction_loop
 350
 351
 352 align 16
 353 _less_than_32:
 354         ; mov initial crc to the return value. this is necessary for zero-length buffers.
     ; (arg1_low32 holds init_crc shifted left by 16 at this point; the
     ; shr eax, 16 in _cleanup turns it back into init_crc)
 355         mov     eax, arg1_low32
 356         test    arg3, arg3
 357         je      _cleanup
 358
 359         movdqa xmm7, [SHUF_MASK]        ; xmm7 = pshufb mask that reverses the 16 bytes of a register
 360
 361         movd    xmm0, arg1_low32        ; get the initial crc value
 362         pslldq  xmm0, 12                ; align it to its correct place
 363
 364         cmp     arg3, 16
 365         je      _exact_16_left
 366         jl      _less_than_16_left
 367         ; more than 16 bytes: take the first 16 into xmm3, then merge the tail
 368         movdqu  xmm3, [arg2]    ; load the plaintext
 369         pshufb  xmm3, xmm7      ; byte-reflect the plaintext
 370         pxor    xmm3, xmm0      ; xor the initial crc value
 371         add     arg2, 16
 372         sub     arg3, 16        ; arg3 = number of tail bytes still in memory
 373         movdqa  xmm6, [rk1]     ; rk1 and rk2 in xmm6
 374         jmp     _get_last_two_xmms
 375
 376
 377 align 16
 378 _less_than_16_left:
 379         ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
     ; Reached from _less_than_32 with xmm7 = SHUF_MASK and xmm0 = initial
     ; crc in its top 32 bits. The input is copied into the 16-byte scratch
     ; area at [rsp] in chunks of 8, 4, 2 and 1 bytes, so nothing beyond
     ; buf+len is read. r11 is the write cursor into the scratch area.
 380
 381         pxor    xmm1, xmm1
 382         mov     r11, rsp
 383         movdqa  [r11], xmm1
 384
 385         cmp     arg3, 4
 386         jl      _only_less_than_4
 387
 388         ;       backup the counter value (arg3 is consumed by the copy below; r9 keeps the original length)
 389         mov     r9, arg3
 390         cmp     arg3, 8
 391         jl      _less_than_8_left
 392
 393         ; load 8 Bytes
 394         mov     rax, [arg2]
 395         mov     [r11], rax
 396         add     r11, 8
 397         sub     arg3, 8
 398         add     arg2, 8
 399 _less_than_8_left:
 400
 401         cmp     arg3, 4
 402         jl      _less_than_4_left
 403
 404         ; load 4 Bytes
 405         mov     eax, [arg2]
 406         mov     [r11], eax
 407         add     r11, 4
 408         sub     arg3, 4
 409         add     arg2, 4
 410 _less_than_4_left:
 411
 412         cmp     arg3, 2
 413         jl      _less_than_2_left
 414
 415         ; load 2 Bytes
 416         mov     ax, [arg2]
 417         mov     [r11], ax
 418         add     r11, 2
 419         sub     arg3, 2
 420         add     arg2, 2
 421 _less_than_2_left:
 422         cmp     arg3, 1
 423         jl      _zero_left
 424
 425         ; load 1 Byte
 426         mov     al, [arg2]
 427         mov     [r11], al
 428 _zero_left:
 429         movdqa  xmm3, [rsp]     ; the copied bytes followed by zero padding
 430         pshufb  xmm3, xmm7      ; byte-reflect; the data now sits in the top bytes of xmm3
 431         pxor    xmm3, xmm0      ; xor the initial crc value
 432
 433         ; shl r9, 4   (disabled; the table is indexed in bytes, so r9 is used unscaled)
 434         lea     rax, [pshufb_shf_table + 16]
 435         sub     rax, r9
 436         movdqu  xmm0, [rax]
 437         pxor    xmm0, [mask1]   ; flip bit 7 of every mask byte: turns the shift-left mask into a shift-right mask
 438         ; shift xmm3 to the right by 16-r9 bytes
 439         pshufb  xmm3, xmm0
 440         jmp     _128_done
 441
 442 align 16
 443 _exact_16_left:
     ; len == 16 (reached from _less_than_32 with xmm7 = SHUF_MASK and
     ; xmm0 = initial crc in its top 32 bits): the whole input fits in one
     ; register, so no folding is needed before the final reduction.
 444         movdqu  xmm3, [arg2]    ; load the 16 input bytes
 445         pshufb  xmm3, xmm7      ; byte-reflect them
 446         pxor    xmm3, xmm0      ; xor the initial crc value
 447
 448         jmp     _128_done
 449
 450 _only_less_than_4:
     ; len is 1, 2 or 3. Reached from _less_than_16_left, which has already
     ; zeroed the 16-byte scratch area at [rsp] and set r11 = rsp.
     ; xmm7 = SHUF_MASK and xmm0 = initial crc in its top 32 bits, as set
     ; up in _less_than_32. Each case copies the bytes one at a time,
     ; byte-reflects them, xors the crc, shifts right by a fixed byte count
     ; and jumps straight to _barrett, bypassing the folds in _128_done.
 451         cmp     arg3, 3
 452         jl      _only_less_than_3
 453
 454         ; load 3 Bytes
 455         mov     al, [arg2]
 456         mov     [r11], al
 457
 458         mov     al, [arg2+1]
 459         mov     [r11+1], al
 460
 461         mov     al, [arg2+2]
 462         mov     [r11+2], al
 463
 464         movdqa  xmm3, [rsp]
 465         pshufb  xmm3, xmm7
 466         pxor    xmm3, xmm0      ; xor the initial crc value
 467
 468         psrldq  xmm3, 5         ; 3-byte input: shift right by 5 bytes
 469
 470         jmp     _barrett
 471 _only_less_than_3:
 472         cmp     arg3, 2
 473         jl      _only_less_than_2
 474
 475         ; load 2 Bytes
 476         mov     al, [arg2]
 477         mov     [r11], al
 478
 479         mov     al, [arg2+1]
 480         mov     [r11+1], al
 481
 482         movdqa  xmm3, [rsp]
 483         pshufb  xmm3, xmm7
 484         pxor    xmm3, xmm0      ; xor the initial crc value
 485
 486         psrldq  xmm3, 6         ; 2-byte input: shift right by 6 bytes
 487
 488         jmp     _barrett
 489 _only_less_than_2:
 490         ; no length check here: len == 0 was already sent to _cleanup in _less_than_32
 491         ; load 1 Byte
 492         mov     al, [arg2]
 493         mov     [r11], al
 494
 495         movdqa  xmm3, [rsp]
 496         pshufb  xmm3, xmm7
 497         pxor    xmm3, xmm0      ; xor the initial crc value
 498
 499         psrldq  xmm3, 7         ; 1-byte input: shift right by 7 bytes
 500
 501         jmp     _barrett
 502
 503 section .data
 504 ; NOTE(review): nothing in the visible code writes to these tables; whether a read-only section could be used instead depends on the supported output formats -- confirm
 505 ; precomputed constants
 506 ; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
 507 align 16
     ; The constants are loaded in 16-byte pairs (rk1:rk2, rk3:rk4, rk5:rk6,
     ; rk7:rk8) with movdqa, and mask1 is used as a memory operand of pxor,
     ; so the 16-byte alignment and the order of the entries below matter.
 508 ; Q = 0x18BB70000
 509 ; rk1 = 2^(32*3) mod Q << 32
 510 ; rk2 = 2^(32*5) mod Q << 32
 511 ; rk3 = 2^(32*15) mod Q << 32
 512 ; rk4 = 2^(32*17) mod Q << 32
 513 ; rk5 = 2^(32*3) mod Q << 32
 514 ; rk6 = 2^(32*2) mod Q << 32
 515 ; rk7 = floor(2^64/Q)
 516 ; rk8 = Q
 517 rk1:
 518 DQ 0x2d56000000000000
 519 rk2:
 520 DQ 0x06df000000000000
 521 rk3:
 522 DQ 0x044c000000000000
 523 rk4:
 524 DQ 0xe658000000000000
 525 rk5:
 526 DQ 0x2d56000000000000
 527 rk6:
 528 DQ 0x1368000000000000
 529 rk7:
 530 DQ 0x00000001f65a57f8
 531 rk8:
 532 DQ 0x000000018bb70000
     ; mask1: bit 7 of every byte. XORing a pshufb_shf_table mask with it
     ; swaps which lanes pshufb zeroes, turning the shift-left mask into the
     ; complementary shift-right mask.
 533 mask1:
 534 dq 0x8080808080808080, 0x8080808080808080
     ; mask2: keeps the low 96 bits and clears the top 32 bits of a register.
 535 mask2:
 536 dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
 537
     ; SHUF_MASK: pshufb control bytes 0x0F, 0x0E, ..., 0x00, i.e. a full
     ; reversal of the 16 bytes of a register.
 538 SHUF_MASK:
 539 dq 0x08090A0B0C0D0E0F, 0x0001020304050607
 540
 541 pshufb_shf_table:
 542 ; use these values for shift constants for the pshufb instruction
 543 ; a 16-byte load at pshufb_shf_table + k (k = 1..15) yields the values shown in row k:
 544 ;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
 545 ;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
 546 ;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
 547 ;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
 548 ;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
 549 ;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
 550 ;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
 551 ;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
 552 ;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
 553 ;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
 554 ;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
 555 ;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
 556 ;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
 557 ;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
 558 ;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
 559 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
 560 dq 0x0706050403020100, 0x000e0d0c0b0a0908
 561
 562 ;;;       func             core, ver, snum
 563 slversion crc16_t10dif_by4, 05,   02,  0016