;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
;       Function API:
;       UINT16 crc16_t10dif_by4(
;               UINT16 init_crc,          //initial CRC value, 16 bits
;               const unsigned char *buf, //buffer pointer to calculate CRC on
;               UINT64 len                //buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://download.intel.com/design/intarch/papers/323102.pdf
;

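;       Algorithm overview:
;       The 16-bit CRC is computed as a 32-bit CRC with the polynomial
;       scaled up by 16 bits (0x8bb70000; see the constants in .data).
;       The bulk of the buffer is folded 64 bytes at a time using
;       PCLMULQDQ, the remainder is folded 16 bytes at a time, and the
;       final 128 bits are reduced to the CRC by two folds followed by
;       a Barrett reduction. Tails shorter than 16 bytes are staged
;       through a zeroed 16-byte stack buffer.
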
%include "reg_sizes.asm"

%define fetch_dist 1024

[bits 64]
default rel

section .text
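; The argument registers differ between the two supported ABIs: the
; Microsoft x64 convention passes the first three arguments in rcx, rdx,
; r8, while the System V AMD64 ABI uses rdi, rsi, rdx. Both are mapped
; to the common names arg1..arg3 below.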
%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8

        %xdefine        arg1_low32 ecx
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx

        %xdefine        arg1_low32 edi
%endif

align 16
global crc16_t10dif_by4:function
crc16_t10dif_by4:

        ; adjust the 16-bit initial_crc value, scale it to 32 bits
        shl     arg1_low32, 16

        ; After this point, the code flow is exactly the same as for a
        ; 32-bit CRC. The only difference is that before returning eax,
        ; we shift it right by 16 bits, to scale back down to 16 bits.

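        ; Example: init_crc = 0x1234 becomes 0x12340000; the 16-bit CRC
        ; ends up in the upper half of the 32-bit result and is recovered
        ; by the shr in _cleanup.
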
        sub     rsp, 16*4+8

        ; save xmm6 and xmm7 on the stack; they are callee-saved
        ; registers under the win64 ABI
        movdqa  [rsp+16*2], xmm6
        movdqa  [rsp+16*3], xmm7

        ; check if the buffer is smaller than 128B
        cmp     arg3, 128

        ; for sizes less than 128, we can't fold 64B at a time...
        jl      _less_than_128


        ; load the initial crc value
        movd    xmm6, arg1_low32        ; initial crc

        ; the crc value does not need to be byte-reflected, but it does
        ; need to be moved to the high part of the register, because the
        ; data will be byte-reflected and will line up with the initial
        ; crc in the correct place.
        pslldq  xmm6, 12

        movdqa  xmm7, [SHUF_MASK]
        ; load the initial 64B of data and xor in the initial crc value
        movdqu  xmm0, [arg2]
        movdqu  xmm1, [arg2+16]
        movdqu  xmm2, [arg2+32]
        movdqu  xmm3, [arg2+48]

        pshufb  xmm0, xmm7
        ; XOR the initial_crc value
        pxor    xmm0, xmm6
        pshufb  xmm1, xmm7
        pshufb  xmm2, xmm7
        pshufb  xmm3, xmm7

        movdqa  xmm6, [rk3]     ; xmm6 has rk3 and rk4
                                ; the imm value of the pclmulqdq instruction
                                ; will determine which constant to use
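        ; rk3 = 2^480 mod Q << 32 and rk4 = 2^544 mod Q << 32 (see the
        ; .data section); these fold across 64 bytes = 512 bits, with
        ; 480 = 512-32 and 544 = 512+32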
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; we subtract 128 instead of 64 to save one instruction from the loop
        sub     arg3, 128

        ; at this point there are 64*x+y (0 <= y < 64) bytes of buffer
        ; left. The _fold_64_B_loop will fold 64B at a time until only
        ; 64+y bytes of buffer remain.


        ; fold 64B at a time. This section of the code folds 4 xmm
        ; registers in parallel
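        ; each iteration folds a register as
        ;   new = (old_hi64 clmul rk4) xor (old_lo64 clmul rk3) xor next_data
        ; imm 0x11 multiplies the two high qwords, imm 0x0 the two low
        ; qwords, carrying the running remainder forward by 64 bytes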
_fold_64_B_loop:

        ; update the buffer pointer
        add     arg2, 64                ; buf += 64;

        prefetchnta [arg2+fetch_dist+0]
        movdqu  xmm4, xmm0
        movdqu  xmm5, xmm1

        pclmulqdq       xmm0, xmm6, 0x11
        pclmulqdq       xmm1, xmm6, 0x11

        pclmulqdq       xmm4, xmm6, 0x0
        pclmulqdq       xmm5, xmm6, 0x0

        pxor    xmm0, xmm4
        pxor    xmm1, xmm5

        prefetchnta [arg2+fetch_dist+32]
        movdqu  xmm4, xmm2
        movdqu  xmm5, xmm3

        pclmulqdq       xmm2, xmm6, 0x11
        pclmulqdq       xmm3, xmm6, 0x11

        pclmulqdq       xmm4, xmm6, 0x0
        pclmulqdq       xmm5, xmm6, 0x0

        pxor    xmm2, xmm4
        pxor    xmm3, xmm5

        movdqu  xmm4, [arg2]
        movdqu  xmm5, [arg2+16]
        pshufb  xmm4, xmm7
        pshufb  xmm5, xmm7
        pxor    xmm0, xmm4
        pxor    xmm1, xmm5

        movdqu  xmm4, [arg2+32]
        movdqu  xmm5, [arg2+48]
        pshufb  xmm4, xmm7
        pshufb  xmm5, xmm7

        pxor    xmm2, xmm4
        pxor    xmm3, xmm5

        sub     arg3, 64

        ; check if there is another 64B in the buffer to be able to fold
        jge     _fold_64_B_loop
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


        add     arg2, 64
        ; at this point, the buffer pointer is pointing at the last y bytes
        ; of the buffer, and the 64B of folded data is in 4 of the xmm
        ; registers: xmm0, xmm1, xmm2, xmm3


        ; fold the 4 xmm registers to 1 xmm register with different constants

        movdqa  xmm6, [rk1]     ; xmm6 has rk1 and rk2
                                ; the imm value of the pclmulqdq instruction
                                ; will determine which constant to use

        movdqa  xmm4, xmm0
        pclmulqdq       xmm0, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm1, xmm4
        pxor    xmm1, xmm0

        movdqa  xmm4, xmm1
        pclmulqdq       xmm1, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm2, xmm4
        pxor    xmm2, xmm1

        movdqa  xmm4, xmm2
        pclmulqdq       xmm2, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2


        ; instead of 64, we add 48 to the loop counter to save one
        ; instruction from the loop; instead of a cmp instruction, we use
        ; the sign flag set by the add together with the jl instruction
        add     arg3, 64-16
        jl      _final_reduction_for_128

        ; now we have 16+y bytes left to reduce. 16 bytes are in register
        ; xmm3 and the rest is in memory. We can fold 16 bytes at a time
        ; if y >= 16; continue folding 16B at a time.

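        ; rk1 = 2^96 mod Q << 32 and rk2 = 2^160 mod Q << 32 (see .data);
        ; these fold across 16 bytes = 128 bits, with 96 = 128-32 and
        ; 160 = 128+32
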
_16B_reduction_loop:
        movdqa  xmm4, xmm3
        pclmulqdq       xmm3, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        movdqu  xmm0, [arg2]
        pshufb  xmm0, xmm7
        pxor    xmm3, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the
        ; jge instruction; equivalent of: cmp arg3, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge     _16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm3 register


_final_reduction_for_128:
        ; check if any more data to fold. If not, compute the CRC of the final 128 bits
        add     arg3, 16
        je      _128_done

        ; here we process the remaining data, which is less than 16 bytes.
        ; since we know that there was data before the pointer, we can
        ; offset the input pointer back before the actual point, to load
        ; exactly 16 bytes. After that, the registers need to be adjusted.
_get_last_two_xmms:
        movdqa  xmm2, xmm3

        movdqu  xmm1, [arg2 - 16 + arg3]
        pshufb  xmm1, xmm7

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, arg3
        movdqu  xmm0, [rax]

        ; shift xmm2 to the left by arg3 bytes
        pshufb  xmm2, xmm0

        ; shift xmm3 to the right by 16-arg3 bytes
        pxor    xmm0, [mask1]
        pshufb  xmm3, xmm0
        pblendvb        xmm1, xmm2      ; xmm0 is the implicit blend mask

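        ; in effect, xmm3 now holds the old remainder aligned for one
        ; more 16-byte fold, and xmm1 combines the new tail bytes with
        ; the bytes of the old remainder that were shifted out; pblendvb
        ; selects each byte from xmm2 when the high bit of the
        ; corresponding xmm0 mask byte is set, and keeps xmm1 otherwise
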
        ; fold 16 bytes
        movdqa  xmm2, xmm1
        movdqa  xmm4, xmm3
        pclmulqdq       xmm3, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2

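        ; the code below reduces the 128-bit remainder in xmm3 to 32 bits:
        ; a 64-bit fold using rk5 = 2^96 mod Q << 32, then a 32-bit fold
        ; using rk6 = 2^64 mod Q << 32, leaving a value ready for the
        ; Barrett reduction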
_128_done:
        ; compute crc of a 128-bit value
        movdqa  xmm6, [rk5]     ; rk5 and rk6 in xmm6
        movdqa  xmm0, xmm3

        ; 64b fold
        pclmulqdq       xmm3, xmm6, 0x1
        pslldq  xmm0, 8
        pxor    xmm3, xmm0

        ; 32b fold
        movdqa  xmm0, xmm3

        pand    xmm0, [mask2]

        psrldq  xmm3, 12
        pclmulqdq       xmm3, xmm6, 0x10
        pxor    xmm3, xmm0

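        ; in outline, the Barrett reduction replaces a division: multiply
        ; the remaining value by rk7 = floor(2^64/Q) to estimate the
        ; quotient, multiply that estimate by rk8 = Q, and xor the two to
        ; obtain the remainder; the CRC is then read from bits 32..63
        ; with pextrd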
        ; barrett reduction
_barrett:
        movdqa  xmm6, [rk7]     ; rk7 and rk8 in xmm6
        movdqa  xmm0, xmm3
        pclmulqdq       xmm3, xmm6, 0x01
        pslldq  xmm3, 4
        pclmulqdq       xmm3, xmm6, 0x11

        pslldq  xmm3, 4
        pxor    xmm3, xmm0
        pextrd  eax, xmm3, 1

_cleanup:
        ; scale the result back to 16 bits
        shr     eax, 16
        movdqa  xmm6, [rsp+16*2]
        movdqa  xmm7, [rsp+16*3]
        add     rsp, 16*4+8
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_128:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      _less_than_32
        movdqa  xmm7, [SHUF_MASK]

        ; if there is, load the constants
        movdqa  xmm6, [rk1]             ; rk1 and rk2 in xmm6

        movd    xmm0, arg1_low32        ; get the initial crc value
        pslldq  xmm0, 12                ; align it to its correct place
        movdqu  xmm3, [arg2]            ; load the data
        pshufb  xmm3, xmm7              ; byte-reflect the data
        pxor    xmm3, xmm0

        ; update the buffer pointer
        add     arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one
        ; instruction from the loop
        sub     arg3, 32

        jmp     _16B_reduction_loop


align 16
_less_than_32:
        ; move the initial crc to the return value; this is necessary for
        ; zero-length buffers
        mov     eax, arg1_low32
        test    arg3, arg3
        je      _cleanup

        movdqa  xmm7, [SHUF_MASK]

        movd    xmm0, arg1_low32        ; get the initial crc value
        pslldq  xmm0, 12                ; align it to its correct place

        cmp     arg3, 16
        je      _exact_16_left
        jl      _less_than_16_left

        movdqu  xmm3, [arg2]            ; load the data
        pshufb  xmm3, xmm7              ; byte-reflect the data
        pxor    xmm3, xmm0              ; xor the initial crc value
        add     arg2, 16
        sub     arg3, 16
        movdqa  xmm6, [rk1]             ; rk1 and rk2 in xmm6
        jmp     _get_last_two_xmms


align 16
_less_than_16_left:
        ; use stack space to load data less than 16 bytes; zero out the
        ; 16B in memory first

        pxor    xmm1, xmm1
        mov     r11, rsp
        movdqa  [r11], xmm1

        cmp     arg3, 4
        jl      _only_less_than_4

        ; back up the counter value
        mov     r9, arg3
        cmp     arg3, 8
        jl      _less_than_8_left

        ; load 8 bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
_less_than_8_left:

        cmp     arg3, 4
        jl      _less_than_4_left

        ; load 4 bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
_less_than_4_left:

        cmp     arg3, 2
        jl      _less_than_2_left

        ; load 2 bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
_less_than_2_left:
        cmp     arg3, 1
        jl      _zero_left

        ; load 1 byte
        mov     al, [arg2]
        mov     [r11], al
_zero_left:
        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

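        ; the bytes were stored at the bottom of the 16-byte stack buffer,
        ; so after byte-reflection the valid data sits at the top of xmm3;
        ; the pshufb_shf_table entry for the original length (saved in r9)
        ; shifts the register into the position that _128_done expects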
        ; shl r9, 4
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, r9
        movdqu  xmm0, [rax]
        pxor    xmm0, [mask1]

        pshufb  xmm3, xmm0
        jmp     _128_done

align 16
_exact_16_left:
        movdqu  xmm3, [arg2]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

        jmp     _128_done

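        ; with fewer than 4 bytes there is not enough data for the 64-bit
        ; and 32-bit folds in _128_done, so the paths below position the
        ; data with psrldq and jump directly to the Barrett reduction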
_only_less_than_4:
        cmp     arg3, 3
        jl      _only_less_than_3

        ; load 3 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        mov     al, [arg2+2]
        mov     [r11+2], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

        psrldq  xmm3, 5

        jmp     _barrett
_only_less_than_3:
        cmp     arg3, 2
        jl      _only_less_than_2

        ; load 2 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

        psrldq  xmm3, 6

        jmp     _barrett
_only_less_than_2:

        ; load 1 byte
        mov     al, [arg2]
        mov     [r11], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

        psrldq  xmm3, 7

        jmp     _barrett

section .data

; precomputed constants
; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
align 16
; Q = 0x18BB70000
; rk1 = 2^(32*3) mod Q << 32
; rk2 = 2^(32*5) mod Q << 32
; rk3 = 2^(32*15) mod Q << 32
; rk4 = 2^(32*17) mod Q << 32
; rk5 = 2^(32*3) mod Q << 32
; rk6 = 2^(32*2) mod Q << 32
; rk7 = floor(2^64/Q)
; rk8 = Q
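; the "<< 32" in the definitions above means each folding constant is
; stored in the upper 32 bits of its quadword, e.g.
; rk1 = (2^96 mod Q) << 32 = 0x2d56000000000000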
rk1:
        DQ 0x2d56000000000000
rk2:
        DQ 0x06df000000000000
rk3:
        DQ 0x044c000000000000
rk4:
        DQ 0xe658000000000000
rk5:
        DQ 0x2d56000000000000
rk6:
        DQ 0x1368000000000000
rk7:
        DQ 0x00000001f65a57f8
rk8:
        DQ 0x000000018bb70000
mask1:
        dq 0x8080808080808080, 0x8080808080808080
mask2:
        dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

SHUF_MASK:
        dq 0x08090A0B0C0D0E0F, 0x0001020304050607
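; SHUF_MASK reverses the byte order of a 16-byte register via pshufb, so
; the first input byte becomes the most significant byte; this is the
; byte-reflection referred to throughout the code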

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1)  / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2)  / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3)  / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4)  / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5)  / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6)  / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7)  / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8)  / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9)  / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908
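; loading 16 bytes at (pshufb_shf_table + 16 - len) yields a mask that
; shifts a register left by len bytes; xoring that mask with mask1 (0x80
; in every byte) produces the complementary right-shift mask, because
; pshufb zeroes any destination byte whose mask byte has its high bit set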

;;;     func                   core, ver, snum
slversion crc16_t10dif_by4, 05,   02,  0016