;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
;       Function API:
;       UINT32 crc32_ieee_by4(
;               UINT32 init_crc,          // initial CRC value, 32 bits
;               const unsigned char *buf, // buffer pointer to calculate CRC on
;               UINT64 len                // buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://download.intel.com/design/intarch/papers/323102.pdf
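;
;       Method overview (see the reference paper above): the CRC is the
;       remainder of the message, viewed as a polynomial over GF(2), modulo
;       P(x) = 0x104c11db7 (the IEEE 802.3 polynomial). PCLMULQDQ performs a
;       64x64 -> 128-bit carry-less multiply, which lets a 128-bit chunk T be
;       folded N bits ahead with two multiplies against the precomputed
;       constants (x^(N+64) mod P) and (x^N mod P) instead of dividing the
;       whole message at once:
;           T * x^N mod P = (T_hi * x^(N+64) + T_lo * x^N) mod P
;
;       A minimal C-side usage sketch (a sketch only: it assumes the usual
;       ISA-L convention of seeding with 0 and chaining the previous return
;       value; buf/len are placeholder names):
;
;           uint32_t crc = crc32_ieee_by4(0, buf, len);    // one-shot
;           crc = crc32_ieee_by4(crc, buf2, len2);         // continue a stream
;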

%include "reg_sizes.asm"

[bits 64]
default rel

section .text

%ifidn __OUTPUT_FORMAT__, win64
	%xdefine	arg1 rcx
	%xdefine	arg2 rdx
	%xdefine	arg3 r8

	%xdefine	arg1_low32 ecx
%else
	%xdefine	arg1 rdi
	%xdefine	arg2 rsi
	%xdefine	arg3 rdx

	%xdefine	arg1_low32 edi
%endif

%ifidn __OUTPUT_FORMAT__, win64
	%define XMM_SAVE 16*2
	%define VARIABLE_OFFSET 16*4+8
%else
	%define VARIABLE_OFFSET 16*2+8
%endif

align 16
global crc32_ieee_by4:function
crc32_ieee_by4:

	not	arg1_low32		; invert the initial crc; inverted back at _cleanup

	sub	rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
	; save xmm6 and xmm7 on the stack: they are callee-saved under the win64 ABI
	movdqa	[rsp + XMM_SAVE + 16*0], xmm6
	movdqa	[rsp + XMM_SAVE + 16*1], xmm7
%endif

	; check if the buffer is smaller than 128B
	cmp	arg3, 128
	jl	_less_than_128

	; load the initial crc value
	movd	xmm6, arg1_low32	; initial crc
	; the crc value does not need to be byte-reflected, but it does need to
	; be moved to the high part of the register, because the data will be
	; byte-reflected and will then align with the initial crc at the
	; correct place
	pslldq	xmm6, 12

	movdqa	xmm7, [SHUF_MASK]
	; load the initial 64B of data and xor in the initial crc value
	movdqu	xmm0, [arg2]
	movdqu	xmm1, [arg2+16]
	movdqu	xmm2, [arg2+32]
	movdqu	xmm3, [arg2+48]

	pshufb	xmm0, xmm7
	; XOR the initial_crc value
	pxor	xmm0, xmm6
	pshufb	xmm1, xmm7
	pshufb	xmm2, xmm7
	pshufb	xmm3, xmm7

	movdqa	xmm6, [rk3]	; k3 = 2^480 mod POLY << 32
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	; we subtract 128 instead of 64 to save one instruction from the loop
	sub	arg3, 128

	; at this point there are 64*x+y (0 <= y < 64) bytes of buffer left.
	; the _fold_64_B_loop will fold 64B at a time until only 64+y bytes
	; of buffer remain

	; fold 64B at a time. This section of the code folds 4 xmm registers in parallel
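	; per iteration, each of the four 128-bit lanes is folded 64B forward:
	;     T = (T_hi clmul k_hi) xor (T_lo clmul k_lo) xor next_data
	; the two pclmulqdq ops advance the lane's running remainder by
	; 512 bits (one 64B stride) modulo P before the next 64B of
	; byte-reflected input is xored in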
_fold_64_B_loop:

	; update the buffer pointer
	add	arg2, 64

	movdqa	xmm4, xmm0
	movdqa	xmm5, xmm1

	pclmulqdq	xmm0, xmm6, 0x11
	pclmulqdq	xmm1, xmm6, 0x11

	pclmulqdq	xmm4, xmm6, 0x0
	pclmulqdq	xmm5, xmm6, 0x0

	pxor	xmm0, xmm4
	pxor	xmm1, xmm5

	movdqa	xmm4, xmm2
	movdqa	xmm5, xmm3

	pclmulqdq	xmm2, xmm6, 0x11
	pclmulqdq	xmm3, xmm6, 0x11

	pclmulqdq	xmm4, xmm6, 0x0
	pclmulqdq	xmm5, xmm6, 0x0

	pxor	xmm2, xmm4
	pxor	xmm3, xmm5

	movdqu	xmm4, [arg2]
	movdqu	xmm5, [arg2+16]
	pshufb	xmm4, xmm7
	pshufb	xmm5, xmm7
	pxor	xmm0, xmm4
	pxor	xmm1, xmm5

	movdqu	xmm4, [arg2+32]
	movdqu	xmm5, [arg2+48]
	pshufb	xmm4, xmm7
	pshufb	xmm5, xmm7

	pxor	xmm2, xmm4
	pxor	xmm3, xmm5

	sub	arg3, 64

	; check if there is another 64B in the buffer to be able to fold
	jge	_fold_64_B_loop
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	add	arg2, 64
	; at this point, arg2 is pointing at the last y bytes of the buffer
	; and the 64B of folded data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3

	movdqa	xmm6, [rk1]	; k1

	; fold the 4 xmm registers down to 1 xmm register, 16 bytes at a time,
	; using the rk1/rk2 constants
	movdqa	xmm4, xmm0
	pclmulqdq	xmm0, xmm6, 0x11
	pclmulqdq	xmm4, xmm6, 0x0
	pxor	xmm1, xmm4
	xorps	xmm1, xmm0

	movdqa	xmm4, xmm1
	pclmulqdq	xmm1, xmm6, 0x11
	pclmulqdq	xmm4, xmm6, 0x0
	pxor	xmm2, xmm4
	xorps	xmm2, xmm1

	movdqa	xmm4, xmm2
	pclmulqdq	xmm2, xmm6, 0x11
	pclmulqdq	xmm4, xmm6, 0x0
	pxor	xmm3, xmm4
	pxor	xmm3, xmm2
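	; each of the three combine steps above folds one register forward by
	; 128 bits and xors it into its neighbor:
	;     xmm(i+1) ^= (xmm(i)_hi clmul k1) xor (xmm(i)_lo clmul k2)
	; after the chain, xmm3 holds the single 128-bit running remainder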

	; instead of 64, we add 48 to the loop counter to save 1 instruction from the loop
	; instead of a cmp instruction, we use the sign flag with the jl instruction
	add	arg3, 64-16
	jl	_final_reduction_for_128

	; now we have 16+y bytes left to reduce. 16 bytes are in register xmm3
	; and the rest is in memory; we can fold 16 bytes at a time if y >= 16
	; continue folding 16B at a time

_16B_reduction_loop:
	movdqa	xmm4, xmm3
	pclmulqdq	xmm3, xmm6, 0x11
	pclmulqdq	xmm4, xmm6, 0x0
	pxor	xmm3, xmm4
	movdqu	xmm0, [arg2]
	pshufb	xmm0, xmm7
	pxor	xmm3, xmm0
	add	arg2, 16
	sub	arg3, 16
	; instead of a cmp instruction, we utilize the flags with the jge instruction
	; equivalent of: cmp arg3, 16-16
	; check if there is any more 16B in the buffer to be able to fold
	jge	_16B_reduction_loop

	; now we have 16+z bytes left to reduce, where 0 <= z < 16.
	; first, we reduce the data in the xmm3 register

_final_reduction_for_128:
	; check if any more data to fold. If not, compute the CRC of the final 128 bits
	add	arg3, 16
	je	_128_done

	; here we are getting data that is less than 16 bytes.
	; since we know that there was data before the pointer, we can offset
	; the input pointer before the actual point to receive exactly 16 bytes.
	; after that, the registers need to be adjusted.
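	; how the adjustment works: the pshufb_shf_table lookup (indexed by the
	; number of leftover bytes) yields a byte-permutation mask. pshufb with
	; that mask shifts one copy of the remainder; xoring the mask with
	; [mask3] (0x80 in every byte) flips each lane's pshufb "zero this
	; lane" bit, producing the complementary shift for the other copy.
	; pblendvb then merges the shifted-out remainder bytes with the newly
	; loaded tail bytes, leaving one full 16B block plus a correctly
	; aligned remainder to fold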
_get_last_two_xmms:
	movdqa	xmm2, xmm3

	movdqu	xmm1, [arg2 - 16 + arg3]
	pshufb	xmm1, xmm7

	shl	arg3, 4
	lea	rax, [pshufb_shf_table + 15*16]
	sub	rax, arg3
	movdqu	xmm0, [rax]

	pshufb	xmm2, xmm0

	pxor	xmm0, [mask3]

	pshufb	xmm3, xmm0

	pblendvb	xmm1, xmm2	; xmm0 is implicit

	movdqa	xmm2, xmm1

	movdqa	xmm4, xmm3
	pclmulqdq	xmm3, xmm6, 0x11

	pclmulqdq	xmm4, xmm6, 0x0
	pxor	xmm3, xmm4
	pxor	xmm3, xmm2

_128_done:
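	; reduce the 128-bit remainder in xmm3 down to 64 bits and then to
	; 32 bits, folding the upper halves down with the rk5/rk6 constants
	; before the final Barrett reduction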

	movdqa	xmm6, [rk5]
	movdqa	xmm0, xmm3

	; 64b fold
	pclmulqdq	xmm3, xmm6, 0x1
	pslldq	xmm0, 8
	pxor	xmm3, xmm0

	; 32b fold
	movdqa	xmm0, xmm3

	pand	xmm0, [mask4]

	psrldq	xmm3, 12
	pclmulqdq	xmm3, xmm6, 0x10
	pxor	xmm3, xmm0

	; barrett reduction
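	; Barrett reduction computes the final remainder without a division
	; (per the reference paper): with mu = floor(x^64 / P(x)) precomputed
	; in rk7 and P(x) itself in rk8,
	;     T1  = floor(R / x^32) * mu
	;     T2  = floor(T1 / x^32) * P
	;     CRC = (R xor T2) mod x^32
	; all products carry-less; the quotient estimate mu lets two
	; pclmulqdq ops replace the long division by the polynomial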
_barrett:
	movdqa	xmm6, [rk7]
	movdqa	xmm0, xmm3
	pclmulqdq	xmm3, xmm6, 0x01
	pslldq	xmm3, 4
	pclmulqdq	xmm3, xmm6, 0x11

	pslldq	xmm3, 4
	pxor	xmm3, xmm0
	pextrd	eax, xmm3, 1

_cleanup:
	not	eax
%ifidn __OUTPUT_FORMAT__, win64
	movdqa	xmm6, [rsp + XMM_SAVE + 16*0]
	movdqa	xmm7, [rsp + XMM_SAVE + 16*1]
%endif
	add	rsp, VARIABLE_OFFSET
	ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_128:

	; check if there is enough buffer to be able to fold 16B at a time
	cmp	arg3, 32
	jl	_less_than_32
	movdqa	xmm7, [SHUF_MASK]

	; if there is, load the constants
	movdqa	xmm6, [rk1]	; k1

	movd	xmm0, arg1_low32
	pslldq	xmm0, 12
	movdqu	xmm3, [arg2]
	pshufb	xmm3, xmm7
	pxor	xmm3, xmm0

	; update the buffer pointer
	add	arg2, 16

	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
	sub	arg3, 32

	jmp	_16B_reduction_loop

align 16
_less_than_32:
	mov	eax, arg1_low32
	test	arg3, arg3
	je	_cleanup

	movdqa	xmm7, [SHUF_MASK]

	movd	xmm0, arg1_low32
	pslldq	xmm0, 12

	cmp	arg3, 16
	je	_exact_16_left
	jl	_less_than_16_left
	; xmm0 already holds the shifted initial crc from above
	movdqu	xmm3, [arg2]
	pshufb	xmm3, xmm7
	pxor	xmm3, xmm0
	add	arg2, 16
	sub	arg3, 16
	movdqa	xmm6, [rk1]	; k1
	jmp	_get_last_two_xmms

align 16
_less_than_16_left:
	; use stack space to load data less than 16 bytes; zero out the 16B of
	; memory first

	pxor	xmm1, xmm1
	mov	r11, rsp
	movdqa	[r11], xmm1

	cmp	arg3, 4
	jl	_only_less_than_4

	; backup the counter value
	mov	r9, arg3

	cmp	arg3, 8
	jl	_less_than_8_left
	; copy 8 bytes
	mov	rax, [arg2]
	mov	[r11], rax
	add	r11, 8
	sub	arg3, 8
	add	arg2, 8
_less_than_8_left:

	cmp	arg3, 4
	jl	_less_than_4_left
	; copy 4 bytes
	mov	eax, [arg2]
	mov	[r11], eax
	add	r11, 4
	sub	arg3, 4
	add	arg2, 4
_less_than_4_left:

	cmp	arg3, 2
	jl	_less_than_2_left
	; copy 2 bytes
	mov	ax, [arg2]
	mov	[r11], ax
	add	r11, 2
	sub	arg3, 2
	add	arg2, 2
_less_than_2_left:
	cmp	arg3, 1
	jl	_zero_left
	; copy 1 byte
	mov	al, [arg2]
	mov	[r11], al

_zero_left:
	movdqa	xmm3, [rsp]
	pshufb	xmm3, xmm7
	pxor	xmm3, xmm0	; xor the initial crc value

	; shift xmm3 right by (16 - r9) bytes via the pshufb table so the
	; r9 data bytes land in the low end of the register
	shl	r9, 4
	lea	rax, [pshufb_shf_table + 15*16]
	sub	rax, r9
	movdqu	xmm0, [rax]
	pxor	xmm0, [mask3]

	pshufb	xmm3, xmm0
	jmp	_128_done

align 16
_exact_16_left:
	movdqu	xmm3, [arg2]
	pshufb	xmm3, xmm7
	pxor	xmm3, xmm0	; xor the initial crc value

	jmp	_128_done

_only_less_than_4:
	cmp	arg3, 3
	jl	_only_less_than_3

	; copy 3 bytes
	mov	al, [arg2]
	mov	[r11], al

	mov	al, [arg2+1]
	mov	[r11+1], al

	mov	al, [arg2+2]
	mov	[r11+2], al

	movdqa	xmm3, [rsp]
	pshufb	xmm3, xmm7
	pxor	xmm3, xmm0	; xor the initial crc value

	psrldq	xmm3, 5

	jmp	_barrett
_only_less_than_3:
	cmp	arg3, 2
	jl	_only_less_than_2

	; copy 2 bytes
	mov	al, [arg2]
	mov	[r11], al

	mov	al, [arg2+1]
	mov	[r11+1], al

	movdqa	xmm3, [rsp]
	pshufb	xmm3, xmm7
	pxor	xmm3, xmm0	; xor the initial crc value

	psrldq	xmm3, 6

	jmp	_barrett
_only_less_than_2:
	; copy 1 byte
	mov	al, [arg2]
	mov	[r11], al

	movdqa	xmm3, [rsp]
	pshufb	xmm3, xmm7
	pxor	xmm3, xmm0	; xor the initial crc value

	psrldq	xmm3, 7

	jmp	_barrett
; precomputed constants
section .data

align 16
rk1:
dq 0xf200aa6600000000
rk2:
dq 0x17d3315d00000000
rk3:
dq 0xd3504ec700000000
rk4:
dq 0x57a8445500000000
rk5:
dq 0xf200aa6600000000
rk6:
dq 0x490d678d00000000
rk7:
dq 0x0000000104d101df
rk8:
dq 0x0000000104c11db7
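; rk1..rk6 are fold constants of the form (x^n mod P) placed in the upper
; 32 bits of the qword (hence the trailing 00000000), one pair per fold
; distance used above; rk7 is the Barrett constant floor(x^64 / P(x)) and
; rk8 is the polynomial P(x) itself (0x104c11db7)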
mask:
dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
mask2:
dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
mask3:
dq 0x8080808080808080, 0x8080808080808080
mask4:
dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
align 32
pshufb_shf_table:
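; each 16B entry is a pshufb control mask: control bytes 0x00-0x0f select
; a source byte, while any byte with bit 7 set (the 0x8n values) zeroes
; the destination lane. xoring an entry with mask3 (0x80 per byte) flips
; every lane between "select" and "zero", turning a right-shift mask into
; the complementary left-shift mask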
dq 0x8887868584838281, 0x008f8e8d8c8b8a89	; shl 15 (16-1)  / shr1
dq 0x8988878685848382, 0x01008f8e8d8c8b8a	; shl 14 (16-2)  / shr2
dq 0x8a89888786858483, 0x0201008f8e8d8c8b	; shl 13 (16-3)  / shr3
dq 0x8b8a898887868584, 0x030201008f8e8d8c	; shl 12 (16-4)  / shr4
dq 0x8c8b8a8988878685, 0x04030201008f8e8d	; shl 11 (16-5)  / shr5
dq 0x8d8c8b8a89888786, 0x0504030201008f8e	; shl 10 (16-6)  / shr6
dq 0x8e8d8c8b8a898887, 0x060504030201008f	; shl 9  (16-7)  / shr7
dq 0x8f8e8d8c8b8a8988, 0x0706050403020100	; shl 8  (16-8)  / shr8
dq 0x008f8e8d8c8b8a89, 0x0807060504030201	; shl 7  (16-9)  / shr9
dq 0x01008f8e8d8c8b8a, 0x0908070605040302	; shl 6  (16-10) / shr10
dq 0x0201008f8e8d8c8b, 0x0a09080706050403	; shl 5  (16-11) / shr11
dq 0x030201008f8e8d8c, 0x0b0a090807060504	; shl 4  (16-12) / shr12
dq 0x04030201008f8e8d, 0x0c0b0a0908070605	; shl 3  (16-13) / shr13
dq 0x0504030201008f8e, 0x0d0c0b0a09080706	; shl 2  (16-14) / shr14
dq 0x060504030201008f, 0x0e0d0c0b0a090807	; shl 1  (16-15) / shr15

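; SHUF_MASK is a pshufb control that reverses all 16 bytes of a register,
; converting the loaded little-endian data to the big-endian byte order
; that the non-reflected polynomial arithmetic expects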
SHUF_MASK	dq 0x08090A0B0C0D0E0F, 0x0001020304050607

;;;       func            core, ver, snum
slversion crc32_ieee_by4, 05,   02,  0017