; [ceph.git] / ceph / src / isa-l / igzip / crc32_gzip.asm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;       Function API:
;       UINT32 crc32_gzip(
;               UINT32 init_crc, //initial CRC value, 32 bits
;               const unsigned char *buf, //buffer pointer to calculate CRC on
;               UINT64 len //buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://download.intel.com/design/intarch/papers/323102.pdf
;
;
;       sample yasm command line:
;       yasm -f x64 -f elf64 -X gnu -g dwarf2 crc32_gzip
;
;       As explained here:
;       http://docs.oracle.com/javase/7/docs/api/java/util/zip/package-summary.html
;       CRC-32 checksum is described in RFC 1952
;       Implementing RFC 1952 CRC:
;       http://www.ietf.org/rfc/rfc1952.txt

%include "reg_sizes.asm"

[bits 64]
default rel

section .text


; Argument registers and stack-frame layout, selected once per output format.
;   arg1 / arg1_low32 : init_crc (64-bit register and its 32-bit view)
;   arg2              : buf
;   arg3              : len
;   TMP               : offset of the scratch slot at the bottom of the frame
;   XMM_SAVE          : offset of the xmm save area (win64 only)
;   VARIABLE_OFFSET   : total number of bytes subtracted from rsp
%define TMP (16*0)

%ifidn __OUTPUT_FORMAT__, win64
        ; Microsoft x64: integer arguments arrive in rcx, rdx, r8
        %xdefine        arg1            rcx
        %xdefine        arg1_low32      ecx
        %xdefine        arg2            rdx
        %xdefine        arg3            r8

        ; frame leaves room above XMM_SAVE for eight 16-byte xmm slots
        %define         XMM_SAVE        (16*2)
        %define         VARIABLE_OFFSET (16*10+8)
%else
        ; every other output format: integer arguments arrive in rdi, rsi, rdx
        %xdefine        arg1            rdi
        %xdefine        arg1_low32      edi
        %xdefine        arg2            rsi
        %xdefine        arg3            rdx

        ; no xmm save area is needed here
        %define         VARIABLE_OFFSET (16*2+8)
%endif

;-----------------------------------------------------------------------
; UINT32 crc32_gzip(UINT32 init_crc, const unsigned char *buf, UINT64 len)
; In:    arg1_low32 = init_crc, arg2 = buf, arg3 = len
;        (arg1..arg3 are the %xdefine aliases chosen per output format)
; Out:   eax = CRC
; Stack: VARIABLE_OFFSET bytes are reserved; [rsp] serves as a 16-byte
;        scratch slot for short inputs, and on win64 xmm6-xmm13 are kept
;        at [rsp + XMM_SAVE].
; Note:  arg1_low32, arg2 and arg3 are modified in place.
;-----------------------------------------------------------------------
align 16
global  crc32_gzip
crc32_gzip:

        ; unsigned long c = crc ^ 0xffffffffL;
        not     arg1_low32	; inverted again at _cleanup before returning


        ; Reserve the local frame. The movdqa accesses relative to rsp below need a
        ; 16-byte aligned rsp; VARIABLE_OFFSET is 8 mod 16, so this holds when rsp was
        ; 8 mod 16 on entry (the usual state right after a call - caller assumption).
        sub     rsp, VARIABLE_OFFSET
%ifidn __OUTPUT_FORMAT__, win64
        ; save xmm6-xmm13 on the stack so they can be restored at _cleanup
        ; (xmm6-xmm15 are callee-saved in the Microsoft x64 convention)
        movdqa  [rsp + XMM_SAVE + 16*0], xmm6
        movdqa  [rsp + XMM_SAVE + 16*1], xmm7
        movdqa  [rsp + XMM_SAVE + 16*2], xmm8
        movdqa  [rsp + XMM_SAVE + 16*3], xmm9
        movdqa  [rsp + XMM_SAVE + 16*4], xmm10
        movdqa  [rsp + XMM_SAVE + 16*5], xmm11
        movdqa  [rsp + XMM_SAVE + 16*6], xmm12
        movdqa  [rsp + XMM_SAVE + 16*7], xmm13
%endif

        ; check if smaller than 256B
        cmp     arg3, 256

        ; for sizes less than 256, we can't fold 128B at a time...
        ; NOTE(review): jl is a signed test, so a len with bit 63 set also takes this branch
        jl      _less_than_256


        ; load the initial crc value
        movd    xmm10, arg1_low32      ; initial crc in the low dword, upper 96 bits zero

        ; load the first 128B of data (8 x 16B) into xmm0..xmm7
        movdqu  xmm0, [arg2+16*0]
        movdqu  xmm1, [arg2+16*1]
        movdqu  xmm2, [arg2+16*2]
        movdqu  xmm3, [arg2+16*3]
        movdqu  xmm4, [arg2+16*4]
        movdqu  xmm5, [arg2+16*5]
        movdqu  xmm6, [arg2+16*6]
        movdqu  xmm7, [arg2+16*7]

        ; XOR the initial_crc value into the first 4 bytes of the data
        pxor    xmm0, xmm10
        movdqa  xmm10, [rk3]    ;xmm10 has rk3 (low qword) and rk4 (high qword)
                                        ;imm value of pclmulqdq instruction will determine which constant to use
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; we subtract 256 instead of 128 to save one instruction from the loop:
        ; arg3 stays >= 0 exactly while another full 128B block is available
        ; beyond the block that each loop iteration consumes
        sub     arg3, 256

        ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
        ; loop will fold 128B at a time until we have 128+y Bytes of buffer

        ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
        ;
        ; Every accumulator xmmN (N = 0..7) is updated the same way:
        ;   xmmN = clmul(xmmN.low,  xmm10.high)      ; imm 0x10 -> times rk4
        ;        ^ clmul(xmmN.high, xmm10.low)       ; imm 0x1  -> times rk3
        ;        ^ next 16 bytes of input at [arg2 + 16*N]
        ; xmm8/xmm13 hold copies of the accumulators for the second product and
        ; xmm9/xmm12 hold the freshly loaded data. pxor and xorps are both plain
        ; 128-bit XORs here; the reason for mixing them is not stated in this file.
_fold_128_B_loop:

        ; update the buffer pointer
        add     arg2, 128

        ; accumulators xmm0, xmm1 <- input bytes 0..31 of this block
        movdqu  xmm9, [arg2+16*0]
        movdqu  xmm12, [arg2+16*1]
        movdqa  xmm8, xmm0
        movdqa  xmm13, xmm1
        pclmulqdq       xmm0, xmm10, 0x10
        pclmulqdq       xmm8, xmm10 , 0x1
        pclmulqdq       xmm1, xmm10, 0x10
        pclmulqdq       xmm13, xmm10 , 0x1
        pxor    xmm0, xmm9
        xorps   xmm0, xmm8
        pxor    xmm1, xmm12
        xorps   xmm1, xmm13

        ; accumulators xmm2, xmm3 <- input bytes 32..63
        movdqu  xmm9, [arg2+16*2]
        movdqu  xmm12, [arg2+16*3]
        movdqa  xmm8, xmm2
        movdqa  xmm13, xmm3
        pclmulqdq       xmm2, xmm10, 0x10
        pclmulqdq       xmm8, xmm10 , 0x1
        pclmulqdq       xmm3, xmm10, 0x10
        pclmulqdq       xmm13, xmm10 , 0x1
        pxor    xmm2, xmm9
        xorps   xmm2, xmm8
        pxor    xmm3, xmm12
        xorps   xmm3, xmm13

        ; accumulators xmm4, xmm5 <- input bytes 64..95
        movdqu  xmm9, [arg2+16*4]
        movdqu  xmm12, [arg2+16*5]
        movdqa  xmm8, xmm4
        movdqa  xmm13, xmm5
        pclmulqdq       xmm4, xmm10, 0x10
        pclmulqdq       xmm8, xmm10 , 0x1
        pclmulqdq       xmm5, xmm10, 0x10
        pclmulqdq       xmm13, xmm10 , 0x1
        pxor    xmm4, xmm9
        xorps   xmm4, xmm8
        pxor    xmm5, xmm12
        xorps   xmm5, xmm13

        ; accumulators xmm6, xmm7 <- input bytes 96..127
        movdqu  xmm9, [arg2+16*6]
        movdqu  xmm12, [arg2+16*7]
        movdqa  xmm8, xmm6
        movdqa  xmm13, xmm7
        pclmulqdq       xmm6, xmm10, 0x10
        pclmulqdq       xmm8, xmm10 , 0x1
        pclmulqdq       xmm7, xmm10, 0x10
        pclmulqdq       xmm13, xmm10 , 0x1
        pxor    xmm6, xmm9
        xorps   xmm6, xmm8
        pxor    xmm7, xmm12
        xorps   xmm7, xmm13

        ; the flags set by this sub drive the jge below; nothing in between may alter them
        sub     arg3, 128

        ; check if there is another 128B in the buffer to be able to fold
        jge     _fold_128_B_loop
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


        ; step past the last 128B block that the loop above consumed
        add     arg2, 128
        ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
        ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7


        ; fold the 8 xmm registers to 1 xmm register with different constants
        ;
        ; Each stanza merges one accumulator into xmm7 using its own constant pair
        ; (loaded into xmm10 as low qword / high qword):
        ;   xmm7 ^= clmul(xmmN.high, xmm10.low)      ; imm 0x1
        ;         ^ clmul(xmmN.low,  xmm10.high)     ; imm 0x10

        ; xmm0 with rk9/rk10
        movdqa  xmm10, [rk9]
        movdqa  xmm8, xmm0
        pclmulqdq       xmm0, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        xorps   xmm7, xmm0

        ; xmm1 with rk11/rk12
        movdqa  xmm10, [rk11]
        movdqa  xmm8, xmm1
        pclmulqdq       xmm1, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        xorps   xmm7, xmm1

        ; xmm2 with rk13/rk14
        movdqa  xmm10, [rk13]
        movdqa  xmm8, xmm2
        pclmulqdq       xmm2, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        pxor    xmm7, xmm2

        ; xmm3 with rk15/rk16
        movdqa  xmm10, [rk15]
        movdqa  xmm8, xmm3
        pclmulqdq       xmm3, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        xorps   xmm7, xmm3

        ; xmm4 with rk17/rk18
        movdqa  xmm10, [rk17]
        movdqa  xmm8, xmm4
        pclmulqdq       xmm4, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        pxor    xmm7, xmm4

        ; xmm5 with rk19/rk20
        movdqa  xmm10, [rk19]
        movdqa  xmm8, xmm5
        pclmulqdq       xmm5, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        xorps   xmm7, xmm5

        ; xmm6 with rk1/rk2; xmm10 keeps rk1/rk2 afterwards, which the
        ; 16B loop and _get_last_two_xmms below rely on
        movdqa  xmm10, [rk1]
        movdqa  xmm8, xmm6
        pclmulqdq       xmm6, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        pxor    xmm7, xmm6


        ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
        ; instead of a cmp instruction, we use the negative flag with the jl instruction
        ; (arg3 was y-128 when the loop exited, so it becomes y-16 here)
        add     arg3, 128-16
        jl      _final_reduction_for_128

        ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
        ; we can fold 16 bytes at a time if y>=16
        ; continue folding 16B at a time

; Fold xmm7 forward by 16 bytes per iteration.
; On entry: xmm7 = current 128-bit accumulator, xmm10 = rk1 (low) / rk2 (high),
;           arg2 -> next unread input byte, arg3 = (unread bytes) - 16, >= 0.
; Also entered from _less_than_256 with the same register setup.
_16B_reduction_loop:
        movdqa  xmm8, xmm7
        pclmulqdq       xmm7, xmm10, 0x1        ; xmm7.high * rk1
        pclmulqdq       xmm8, xmm10, 0x10       ; xmm7.low  * rk2
        pxor    xmm7, xmm8
        movdqu  xmm0, [arg2]                    ; next 16 input bytes
        pxor    xmm7, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the jge instruction
        ; equivalent of: cmp arg3, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge     _16B_reduction_loop

        ;now we have 16+z bytes left to reduce, where 0<= z < 16.
        ;first, we reduce the data in the xmm7 register


_final_reduction_for_128:
        ; undo the -16 bias: arg3 = z = number of unread tail bytes (0..15)
        add arg3, 16
        ; no tail bytes: xmm7 already holds everything, go straight to the final reduction
        je _128_done

; here we are getting data that is less than 16 bytes.
        ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
        ; after that the registers need to be adjusted.
        ;
        ; On entry: arg3 = z (1..15) unread tail bytes, arg2 -> first unread byte,
        ;           xmm7 = accumulator, xmm10 = rk1/rk2.
        ; Also entered from _less_than_32 with the same register setup.
_get_last_two_xmms:


        movdqa xmm2, xmm7
        ; last 16 bytes of the buffer: 16-z bytes already consumed followed by the z tail bytes
        movdqu xmm1, [arg2 - 16 + arg3]

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table]
        add     rax, arg3
        movdqu  xmm0, [rax]             ; 16 shuffle-control bytes starting at table offset z


        ; pshufb writes zero wherever the control byte has its top bit set, so:
        pshufb  xmm7, xmm0              ; xmm7 = old xmm7 moved up by 16-z bytes (low bytes zeroed)
        pxor    xmm0, [mask3]           ; flip the top bit of every control byte
        pshufb  xmm2, xmm0              ; xmm2 = old xmm7 moved down by z bytes (high bytes zeroed)

        ; where the mask byte in xmm0 has its top bit set (the upper z bytes),
        ; take the byte from xmm1, i.e. the z new tail bytes
        pblendvb        xmm2, xmm1     ;xmm0 is implicit
        ;;;;;;;;;;
        ; one more 16B fold of xmm7 with rk1/rk2, then XOR in the merged block xmm2
        movdqa  xmm8, xmm7
        pclmulqdq       xmm7, xmm10, 0x1

        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        pxor    xmm7, xmm2

; Final reduction of the 128-bit accumulator in xmm7 down to the value returned in eax.
_128_done:
        ; compute crc of a 128-bit value
        movdqa  xmm10, [rk5]            ; xmm10 = rk5 (low qword) / rk6 (high qword)
        movdqa  xmm0, xmm7

        ;64b fold
        pclmulqdq       xmm7, xmm10, 0  ; xmm7.low * rk5
        psrldq  xmm0, 8                 ; old high qword moved into the low qword
        pxor    xmm7, xmm0

        ;32b fold
        movdqa  xmm0, xmm7
        pslldq  xmm7, 4                 ; shift left by 4 bytes before multiplying
        pclmulqdq       xmm7, xmm10, 0x10       ; xmm7.low * rk6

        pxor    xmm7, xmm0


        ;barrett reduction
        ; also entered directly by the 1-, 2- and 3-byte paths with xmm7 prepared
_barrett:
        pand    xmm7, [mask2]           ; clear the low 32 bits
        movdqa  xmm1, xmm7
        movdqa  xmm2, xmm7
        movdqa  xmm10, [rk7]            ; xmm10 = rk7 (low qword) / rk8 (high qword)

        pclmulqdq       xmm7, xmm10, 0  ; xmm7.low * rk7
        pxor    xmm7, xmm2
        pand    xmm7, [mask]            ; keep only the low 64 bits
        movdqa  xmm2, xmm7
        pclmulqdq       xmm7, xmm10, 0x10       ; xmm7.low * rk8
        pxor    xmm7, xmm2
        pxor    xmm7, xmm1
        pextrd  eax, xmm7, 2            ; result is dword 2 (bits 64..95) of xmm7

; eax holds the still-inverted CRC here; zero-length input arrives with eax = inverted init_crc
_cleanup:
        ; return c ^ 0xffffffffL;
        not     eax


%ifidn __OUTPUT_FORMAT__, win64
        ; restore the xmm registers saved in the prologue
        movdqa  xmm6, [rsp + XMM_SAVE + 16*0]
        movdqa  xmm7, [rsp + XMM_SAVE + 16*1]
        movdqa  xmm8, [rsp + XMM_SAVE + 16*2]
        movdqa  xmm9, [rsp + XMM_SAVE + 16*3]
        movdqa  xmm10, [rsp + XMM_SAVE + 16*4]
        movdqa  xmm11, [rsp + XMM_SAVE + 16*5]
        movdqa  xmm12, [rsp + XMM_SAVE + 16*6]
        movdqa  xmm13, [rsp + XMM_SAVE + 16*7]
%endif
        ; release the local frame reserved in the prologue
        add     rsp, VARIABLE_OFFSET
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Path for len < 256: skip the 128B loop and go straight to 16B folding when possible.
align 16
_less_than_256:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      _less_than_32

        ; if there is, load the constants
        movdqa  xmm10, [rk1]    ; rk1 and rk2 in xmm10

        movd    xmm0, arg1_low32       ; get the initial crc value
        movdqu  xmm7, [arg2]            ; load the plaintext
        pxor    xmm7, xmm0              ; xor the initial crc into the first 4 bytes

        ; update the buffer pointer
        add     arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        ; (arg3 = unread bytes - 16, which is what _16B_reduction_loop expects)
        sub     arg3, 32

        jmp     _16B_reduction_loop


; Path for len < 32: dispatch on len == 0, len == 16, len < 16, or 17..31.
align 16
_less_than_32:
        ; mov initial crc to the return value. this is necessary for zero-length buffers.
        ; (arg1_low32 is already inverted; _cleanup inverts eax back)
        mov     eax, arg1_low32
        test    arg3, arg3
        je      _cleanup

        movd    xmm0, arg1_low32        ; get the initial crc value

        ; one compare feeds both branches: equal -> exactly 16, less -> 1..15
        cmp     arg3, 16
        je      _exact_16_left
        jl      _less_than_16_left

        ; 17..31 bytes: take the first 16 here, the remaining 1..15 are handled as a tail
        movdqu  xmm7, [arg2]            ; load the plaintext
        pxor    xmm7, xmm0              ; xor the initial crc value
        add     arg2, 16
        sub     arg3, 16                ; arg3 = number of tail bytes (1..15)
        movdqa  xmm10, [rk1]    ; rk1 and rk2 in xmm10
        jmp     _get_last_two_xmms


; Path for 1..15 bytes: copy the input into a zeroed 16B stack slot so that no
; byte beyond the end of the buffer is read. xmm0 holds the initial crc on entry.
align 16
_less_than_16_left:
        ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.

        pxor    xmm1, xmm1
        mov     r11, rsp                ; r11 = write cursor into the scratch slot
        movdqa  [r11], xmm1

        cmp     arg3, 4
        jl      _only_less_than_4

        ;       backup the counter value
        ; (r9 = original length, 4..15; used as the shuffle-table offset at _zero_left)
        mov     r9, arg3
        cmp     arg3, 8
        jl      _less_than_8_left

        ; load 8 Bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
_less_than_8_left:

        cmp     arg3, 4
        jl      _less_than_4_left

        ; load 4 Bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
_less_than_4_left:

        cmp     arg3, 2
        jl      _less_than_2_left

        ; load 2 Bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
_less_than_2_left:
        cmp     arg3, 1
        jl      _zero_left

        ; load 1 Byte
        mov     al, [arg2]
        mov     [r11], al

_zero_left:
        movdqa  xmm7, [rsp]             ; input bytes in the low positions, zero padding above
        pxor    xmm7, xmm0      ; xor the initial crc value

        ; shuffle control taken at table offset r9 moves the r9 data bytes to the
        ; top of the register and zeroes the bytes below them
        lea rax,[pshufb_shf_table]
        movdqu  xmm0, [rax + r9]
        pshufb  xmm7,xmm0


        jmp     _128_done

; Exactly 16 bytes: a single load, no folding or shifting needed.
align 16
_exact_16_left:
        movdqu  xmm7, [arg2]
        pxor    xmm7, xmm0      ; xor the initial crc value

        jmp     _128_done

; Paths for 1, 2 or 3 input bytes. On entry r11 = rsp points at the zeroed 16B
; scratch slot and xmm0 holds the initial crc. Bytes are copied one at a time,
; the slot is reloaded, XORed with the crc, shifted left by a length-dependent
; byte count, and control goes straight to _barrett, skipping the folds at _128_done.
_only_less_than_4:
        cmp     arg3, 3
        jl      _only_less_than_3

        ; load 3 Bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        mov     al, [arg2+2]
        mov     [r11+2], al

        movdqa  xmm7, [rsp]
        pxor    xmm7, xmm0      ; xor the initial crc value

        pslldq  xmm7, 5         ; 3 bytes -> shift left by 5 bytes

        jmp     _barrett
_only_less_than_3:
        cmp     arg3, 2
        jl      _only_less_than_2

        ; load 2 Bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        movdqa  xmm7, [rsp]
        pxor    xmm7, xmm0      ; xor the initial crc value

        pslldq  xmm7, 6         ; 2 bytes -> shift left by 6 bytes

        jmp     _barrett
_only_less_than_2:

        ; load 1 Byte
        mov     al, [arg2]
        mov     [r11], al

        movdqa  xmm7, [rsp]
        pxor    xmm7, xmm0      ; xor the initial crc value

        pslldq  xmm7, 7         ; 1 byte -> shift left by 7 bytes

        jmp     _barrett

section .data

; precomputed constants
;
; crc32_gzip loads these with movdqa as adjacent pairs (odd-numbered constant in
; the low qword, the following even-numbered one in the high qword), so every
; odd-numbered label must stay 16-byte aligned:
;   rk1/rk2    16-byte folding (also the last step of the 8 -> 1 register fold)
;   rk3/rk4    128-byte folding loop
;   rk5/rk6    final 128-bit -> 64-bit -> 32-bit folds
;   rk7/rk8    Barrett reduction
;   rk9..rk20  folding xmm0..xmm5 into xmm7 after the 128-byte loop
align 16
rk1 :
DQ 0x00000000ccaa009e
rk2 :
DQ 0x00000001751997d0
rk3 :
DQ 0x000000014a7fe880
rk4 :
DQ 0x00000001e88ef372
rk5 :
DQ 0x00000000ccaa009e
rk6 :
DQ 0x0000000163cd6124
rk7 :
DQ 0x00000001f7011640
rk8 :
DQ 0x00000001db710640
rk9 :
DQ 0x00000001d7cfc6ac
rk10 :
DQ 0x00000001ea89367e
rk11 :
DQ 0x000000018cb44e58
rk12 :
DQ 0x00000000df068dc2
rk13 :
DQ 0x00000000ae0b5394
rk14 :
DQ 0x00000001c7569e54
rk15 :
DQ 0x00000001c6e41596
rk16 :
DQ 0x0000000154442bd4
rk17 :
DQ 0x0000000174359406
rk18 :
DQ 0x000000003db1ecdc
rk19 :
DQ 0x000000015a546366
rk20 :
DQ 0x00000000f1da05aa


; The table is read with movdqu at byte offsets 1..15, so it needs no alignment of
; its own; the align keeps the layout explicit and is a no-op with the data above.
align 16
pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908


; mask, mask2 and mask3 are used as memory operands of pand/pxor, which fault on
; an address that is not 16-byte aligned. Keep the alignment explicit instead of
; relying on the size of the data that happens to precede them.
align 16
mask:
dq     0xFFFFFFFFFFFFFFFF, 0x0000000000000000  ; keeps the low 64 bits
mask2:
dq     0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF  ; clears the low 32 bits
mask3:
dq     0x8080808080808080, 0x8080808080808080  ; top bit of every byte (flips pshufb control bytes)
Commit	Line	Data
7c673cae FG	1	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	2	; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
	3	;
	4	; Redistribution and use in source and binary forms, with or without
	5	; modification, are permitted provided that the following conditions
	6	; are met:
	7	; * Redistributions of source code must retain the above copyright
	8	; notice, this list of conditions and the following disclaimer.
	9	; * Redistributions in binary form must reproduce the above copyright
	10	; notice, this list of conditions and the following disclaimer in
	11	; the documentation and/or other materials provided with the
	12	; distribution.
	13	; * Neither the name of Intel Corporation nor the names of its
	14	; contributors may be used to endorse or promote products derived
	15	; from this software without specific prior written permission.
	16	;
	17	; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	18	; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	19	; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	20	; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	21	; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	22	; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	23	; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	24	; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	25	; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	26	; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	27	; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	28	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	29
	30	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	31	; Function API:
	32	; UINT32 crc32_gzip(
	33	; UINT32 init_crc, //initial CRC value, 32 bits
	34	; const unsigned char *buf, //buffer pointer to calculate CRC on
	35	; UINT64 len //buffer length in bytes (64-bit data)
	36	; );
	37	;
	38	; Authors:
	39	; Erdinc Ozturk
	40	; Vinodh Gopal
	41	; James Guilford
	42	;
	43	; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
	44	; URL: http://download.intel.com/design/intarch/papers/323102.pdf
	45	;
	46	;
	47	; sample yasm command line:
	48	; yasm -f x64 -f elf64 -X gnu -g dwarf2 crc32_gzip
	49	;
	50	; As explained here:
	51	; http://docs.oracle.com/javase/7/docs/api/java/util/zip/package-summary.html
	52	; CRC-32 checksum is described in RFC 1952
	53	; Implementing RFC 1952 CRC:
	54	; http://www.ietf.org/rfc/rfc1952.txt
	55
	56	%include "reg_sizes.asm"
	57
	58	[bits 64]
	59	default rel
	60
	61	section .text
	62
	63
	64	%ifidn __OUTPUT_FORMAT__, win64
65	%xdefine arg1 rcx
66	%xdefine arg2 rdx
67	%xdefine arg3 r8
68
69	%xdefine arg1_low32 ecx
70	%else
71	%xdefine arg1 rdi
72	%xdefine arg2 rsi
73	%xdefine arg3 rdx
74
75	%xdefine arg1_low32 edi
76	%endif
77
78	%define TMP 16*0
79	%ifidn __OUTPUT_FORMAT__, win64
80	%define XMM_SAVE 16*2
81	%define VARIABLE_OFFSET 16*10+8
82	%else
83	%define VARIABLE_OFFSET 16*2+8
84	%endif
85
86	align 16
87	global crc32_gzip
88	crc32_gzip:
89
90	; unsigned long c = crc ^ 0xffffffffL;
91	not arg1_low32 ;
92
93
94	sub rsp, VARIABLE_OFFSET
95	%ifidn __OUTPUT_FORMAT__, win64
96	; push the xmm registers into the stack to maintain
97	movdqa [rsp + XMM_SAVE + 16*0], xmm6
98	movdqa [rsp + XMM_SAVE + 16*1], xmm7
99	movdqa [rsp + XMM_SAVE + 16*2], xmm8
100	movdqa [rsp + XMM_SAVE + 16*3], xmm9
101	movdqa [rsp + XMM_SAVE + 16*4], xmm10
102	movdqa [rsp + XMM_SAVE + 16*5], xmm11
103	movdqa [rsp + XMM_SAVE + 16*6], xmm12
104	movdqa [rsp + XMM_SAVE + 16*7], xmm13
105	%endif
106
107	; check if smaller than 256B
108	cmp arg3, 256
109
110	; for sizes less than 256, we can't fold 128B at a time...
111	jl _less_than_256
112
113
114	; load the initial crc value
115	movd xmm10, arg1_low32 ; initial crc
116
117	; receive the initial 64B data, xor the initial crc value
118	movdqu xmm0, [arg2+16*0]
119	movdqu xmm1, [arg2+16*1]
120	movdqu xmm2, [arg2+16*2]
121	movdqu xmm3, [arg2+16*3]
122	movdqu xmm4, [arg2+16*4]
123	movdqu xmm5, [arg2+16*5]
124	movdqu xmm6, [arg2+16*6]
125	movdqu xmm7, [arg2+16*7]
126
127	; XOR the initial_crc value
128	pxor xmm0, xmm10
129	movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
130	;imm value of pclmulqdq instruction will determine which constant to use
131	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
132	; we subtract 256 instead of 128 to save one instruction from the loop
133	sub arg3, 256
134
135	; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
136	; loop will fold 128B at a time until we have 128+y Bytes of buffer
137
138
139	; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
140	_fold_128_B_loop:
141
142	; update the buffer pointer
143	add arg2, 128
144
145	movdqu xmm9, [arg2+16*0]
146	movdqu xmm12, [arg2+16*1]
147	movdqa xmm8, xmm0
148	movdqa xmm13, xmm1
149	pclmulqdq xmm0, xmm10, 0x10
150	pclmulqdq xmm8, xmm10 , 0x1
151	pclmulqdq xmm1, xmm10, 0x10
152	pclmulqdq xmm13, xmm10 , 0x1
153	pxor xmm0, xmm9
154	xorps xmm0, xmm8
155	pxor xmm1, xmm12
156	xorps xmm1, xmm13
157
158	movdqu xmm9, [arg2+16*2]
159	movdqu xmm12, [arg2+16*3]
160	movdqa xmm8, xmm2
161	movdqa xmm13, xmm3
162	pclmulqdq xmm2, xmm10, 0x10
163	pclmulqdq xmm8, xmm10 , 0x1
164	pclmulqdq xmm3, xmm10, 0x10
165	pclmulqdq xmm13, xmm10 , 0x1
166	pxor xmm2, xmm9
167	xorps xmm2, xmm8
168	pxor xmm3, xmm12
169	xorps xmm3, xmm13
170
171	movdqu xmm9, [arg2+16*4]
172	movdqu xmm12, [arg2+16*5]
173	movdqa xmm8, xmm4
174	movdqa xmm13, xmm5
175	pclmulqdq xmm4, xmm10, 0x10
176	pclmulqdq xmm8, xmm10 , 0x1
177	pclmulqdq xmm5, xmm10, 0x10
178	pclmulqdq xmm13, xmm10 , 0x1
179	pxor xmm4, xmm9
180	xorps xmm4, xmm8
181	pxor xmm5, xmm12
182	xorps xmm5, xmm13
183
184	movdqu xmm9, [arg2+16*6]
185	movdqu xmm12, [arg2+16*7]
186	movdqa xmm8, xmm6
187	movdqa xmm13, xmm7
188	pclmulqdq xmm6, xmm10, 0x10
189	pclmulqdq xmm8, xmm10 , 0x1
190	pclmulqdq xmm7, xmm10, 0x10
191	pclmulqdq xmm13, xmm10 , 0x1
192	pxor xmm6, xmm9
193	xorps xmm6, xmm8
194	pxor xmm7, xmm12
195	xorps xmm7, xmm13
196
197	sub arg3, 128
198
199	; check if there is another 128B in the buffer to be able to fold
200	jge _fold_128_B_loop
201	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
202
203
204	add arg2, 128
205	; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
206	; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
207
208
209	; fold the 8 xmm registers to 1 xmm register with different constants
210
211	movdqa xmm10, [rk9]
212	movdqa xmm8, xmm0
213	pclmulqdq xmm0, xmm10, 0x1
214	pclmulqdq xmm8, xmm10, 0x10
215	pxor xmm7, xmm8
216	xorps xmm7, xmm0
217
218	movdqa xmm10, [rk11]
219	movdqa xmm8, xmm1
220	pclmulqdq xmm1, xmm10, 0x1
221	pclmulqdq xmm8, xmm10, 0x10
222	pxor xmm7, xmm8
223	xorps xmm7, xmm1
224
225	movdqa xmm10, [rk13]
226	movdqa xmm8, xmm2
227	pclmulqdq xmm2, xmm10, 0x1
228	pclmulqdq xmm8, xmm10, 0x10
229	pxor xmm7, xmm8
230	pxor xmm7, xmm2
231
232	movdqa xmm10, [rk15]
233	movdqa xmm8, xmm3
234	pclmulqdq xmm3, xmm10, 0x1
235	pclmulqdq xmm8, xmm10, 0x10
236	pxor xmm7, xmm8
237	xorps xmm7, xmm3
238
239	movdqa xmm10, [rk17]
240	movdqa xmm8, xmm4
241	pclmulqdq xmm4, xmm10, 0x1
242	pclmulqdq xmm8, xmm10, 0x10
243	pxor xmm7, xmm8
244	pxor xmm7, xmm4
245
246	movdqa xmm10, [rk19]
247	movdqa xmm8, xmm5
248	pclmulqdq xmm5, xmm10, 0x1
249	pclmulqdq xmm8, xmm10, 0x10
250	pxor xmm7, xmm8
251	xorps xmm7, xmm5
252
253	movdqa xmm10, [rk1]
254	movdqa xmm8, xmm6
255	pclmulqdq xmm6, xmm10, 0x1
256	pclmulqdq xmm8, xmm10, 0x10
257	pxor xmm7, xmm8
258	pxor xmm7, xmm6
259
260
261	; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
262	; instead of a cmp instruction, we use the negative flag with the jl instruction
263	add arg3, 128-16
264	jl _final_reduction_for_128
265
266	; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
267	; we can fold 16 bytes at a time if y>=16
268	; continue folding 16B at a time
269
270	_16B_reduction_loop:
271	movdqa xmm8, xmm7
272	pclmulqdq xmm7, xmm10, 0x1
273	pclmulqdq xmm8, xmm10, 0x10
274	pxor xmm7, xmm8
275	movdqu xmm0, [arg2]
276	pxor xmm7, xmm0
277	add arg2, 16
278	sub arg3, 16
279	; instead of a cmp instruction, we utilize the flags with the jge instruction
280	; equivalent of: cmp arg3, 16-16
281	; check if there is any more 16B in the buffer to be able to fold
282	jge _16B_reduction_loop
283
284	;now we have 16+z bytes left to reduce, where 0<= z < 16.
285	;first, we reduce the data in the xmm7 register
286
287
288	_final_reduction_for_128:
289	add arg3, 16
290	je _128_done
291
292	; here we are getting data that is less than 16 bytes.
293	; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
294	; after that the registers need to be adjusted.
295	_get_last_two_xmms:
296
297
298	movdqa xmm2, xmm7
299	movdqu xmm1, [arg2 - 16 + arg3]
300
301	; get rid of the extra data that was loaded before
302	; load the shift constant
303	lea rax, [pshufb_shf_table]
304	add rax, arg3
305	movdqu xmm0, [rax]
306
307
308	pshufb xmm7, xmm0
309	pxor xmm0, [mask3]
310	pshufb xmm2, xmm0
311
312	pblendvb xmm2, xmm1 ;xmm0 is implicit
313	;;;;;;;;;;
314	movdqa xmm8, xmm7
315	pclmulqdq xmm7, xmm10, 0x1
316
317	pclmulqdq xmm8, xmm10, 0x10
318	pxor xmm7, xmm8
319	pxor xmm7, xmm2
320
321	_128_done:
322	; compute crc of a 128-bit value
323	movdqa xmm10, [rk5]
324	movdqa xmm0, xmm7
325
326	;64b fold
327	pclmulqdq xmm7, xmm10, 0
328	psrldq xmm0, 8
329	pxor xmm7, xmm0
330
331	;32b fold
332	movdqa xmm0, xmm7
333	pslldq xmm7, 4
334	pclmulqdq xmm7, xmm10, 0x10
335
336	pxor xmm7, xmm0
337
338
339	;barrett reduction
340	_barrett:
341	pand xmm7, [mask2]
342	movdqa xmm1, xmm7
343	movdqa xmm2, xmm7
344	movdqa xmm10, [rk7]
345
346	pclmulqdq xmm7, xmm10, 0
347	pxor xmm7, xmm2
348	pand xmm7, [mask]
349	movdqa xmm2, xmm7
350	pclmulqdq xmm7, xmm10, 0x10
351	pxor xmm7, xmm2
352	pxor xmm7, xmm1
353	pextrd eax, xmm7, 2
354
355	_cleanup:
356	; return c ^ 0xffffffffL;
357	not eax
358
359
360	%ifidn __OUTPUT_FORMAT__, win64
361	movdqa xmm6, [rsp + XMM_SAVE + 16*0]
362	movdqa xmm7, [rsp + XMM_SAVE + 16*1]
363	movdqa xmm8, [rsp + XMM_SAVE + 16*2]
364	movdqa xmm9, [rsp + XMM_SAVE + 16*3]
365	movdqa xmm10, [rsp + XMM_SAVE + 16*4]
366	movdqa xmm11, [rsp + XMM_SAVE + 16*5]
367	movdqa xmm12, [rsp + XMM_SAVE + 16*6]
368	movdqa xmm13, [rsp + XMM_SAVE + 16*7]
369	%endif
370	add rsp, VARIABLE_OFFSET
371	ret
372
373
374	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
375	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
376	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
377	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
378
379	align 16
380	_less_than_256:
381
382	; check if there is enough buffer to be able to fold 16B at a time
383	cmp arg3, 32
384	jl _less_than_32
385
386	; if there is, load the constants
387	movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
388
389	movd xmm0, arg1_low32 ; get the initial crc value
390	movdqu xmm7, [arg2] ; load the plaintext
391	pxor xmm7, xmm0
392
393	; update the buffer pointer
394	add arg2, 16
395
396	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
397	sub arg3, 32
398
399	jmp _16B_reduction_loop
400
401
402	align 16
403	_less_than_32:
404	; mov initial crc to the return value. this is necessary for zero-length buffers.
405	mov eax, arg1_low32
406	test arg3, arg3
407	je _cleanup
408
409	movd xmm0, arg1_low32 ; get the initial crc value
410
411	cmp arg3, 16
412	je _exact_16_left
413	jl _less_than_16_left
414
415	movdqu xmm7, [arg2] ; load the plaintext
416	pxor xmm7, xmm0 ; xor the initial crc value
417	add arg2, 16
418	sub arg3, 16
419	movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
420	jmp _get_last_two_xmms
421
422
423	align 16
424	_less_than_16_left:
425	; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
426
427	pxor xmm1, xmm1
428	mov r11, rsp
429	movdqa [r11], xmm1
430
431	cmp arg3, 4
432	jl _only_less_than_4
433
434	; backup the counter value
435	mov r9, arg3
436	cmp arg3, 8
437	jl _less_than_8_left
438
439	; load 8 Bytes
440	mov rax, [arg2]
441	mov [r11], rax
442	add r11, 8
443	sub arg3, 8
444	add arg2, 8
445	_less_than_8_left:
446
447	cmp arg3, 4
448	jl _less_than_4_left
449
450	; load 4 Bytes
451	mov eax, [arg2]
452	mov [r11], eax
453	add r11, 4
454	sub arg3, 4
455	add arg2, 4
456	_less_than_4_left:
457
458	cmp arg3, 2
459	jl _less_than_2_left
460
461	; load 2 Bytes
462	mov ax, [arg2]
463	mov [r11], ax
464	add r11, 2
465	sub arg3, 2
466	add arg2, 2
467	_less_than_2_left:
468	cmp arg3, 1
469	jl _zero_left
470
471	; load 1 Byte
472	mov al, [arg2]
473	mov [r11], al
474
475	_zero_left:
476	movdqa xmm7, [rsp]
477	pxor xmm7, xmm0 ; xor the initial crc value
478
479	lea rax,[pshufb_shf_table]
480	movdqu xmm0, [rax + r9]
481	pshufb xmm7,xmm0
482
483
484
485	jmp _128_done
486
487	align 16
488	_exact_16_left:
489	movdqu xmm7, [arg2]
490	pxor xmm7, xmm0 ; xor the initial crc value
491
492	jmp _128_done
493
494	_only_less_than_4:
495	cmp arg3, 3
496	jl _only_less_than_3
497
498	; load 3 Bytes
499	mov al, [arg2]
500	mov [r11], al
501
502	mov al, [arg2+1]
503	mov [r11+1], al
504
505	mov al, [arg2+2]
506	mov [r11+2], al
507
508	movdqa xmm7, [rsp]
509	pxor xmm7, xmm0 ; xor the initial crc value
510
511	pslldq xmm7, 5
512
513	jmp _barrett
514	_only_less_than_3:
515	cmp arg3, 2
516	jl _only_less_than_2
517
518	; load 2 Bytes
519	mov al, [arg2]
520	mov [r11], al
521
522	mov al, [arg2+1]
523	mov [r11+1], al
524
525	movdqa xmm7, [rsp]
526	pxor xmm7, xmm0 ; xor the initial crc value
527
528	pslldq xmm7, 6
529
530	jmp _barrett
531	_only_less_than_2:
532
533	; load 1 Byte
534	mov al, [arg2]
535	mov [r11], al
536
537	movdqa xmm7, [rsp]
538	pxor xmm7, xmm0 ; xor the initial crc value
539
540	pslldq xmm7, 7
541
542	jmp _barrett
543
544	section .data
545
546	; precomputed constants
547	align 16
548	rk1 :
549	DQ 0x00000000ccaa009e
550	rk2 :
551	DQ 0x00000001751997d0
552	rk3 :
553	DQ 0x000000014a7fe880
554	rk4 :
555	DQ 0x00000001e88ef372
556	rk5 :
557	DQ 0x00000000ccaa009e
558	rk6 :
559	DQ 0x0000000163cd6124
560	rk7 :
561	DQ 0x00000001f7011640
562	rk8 :
563	DQ 0x00000001db710640
564	rk9 :
565	DQ 0x00000001d7cfc6ac
566	rk10 :
567	DQ 0x00000001ea89367e
568	rk11 :
569	DQ 0x000000018cb44e58
570	rk12 :
571	DQ 0x00000000df068dc2
572	rk13 :
573	DQ 0x00000000ae0b5394
574	rk14 :
575	DQ 0x00000001c7569e54
576	rk15 :
577	DQ 0x00000001c6e41596
578	rk16 :
579	DQ 0x0000000154442bd4
580	rk17 :
581	DQ 0x0000000174359406
582	rk18 :
583	DQ 0x000000003db1ecdc
584	rk19 :
585	DQ 0x000000015a546366
586	rk20 :
587	DQ 0x00000000f1da05aa
588
589
590	pshufb_shf_table:
591	; use these values for shift constants for the pshufb instruction
592	; different alignments result in values as shown:
593	; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
594	; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2
595	; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3
596	; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
597	; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
598	; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
599	; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
600	; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
601	; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
602	; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
603	; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
604	; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
605	; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
606	; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
607	; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
608	dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
609	dq 0x0706050403020100, 0x000e0d0c0b0a0908
610
611
612	mask:
613	dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
614	mask2:
615	dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
616	mask3:
617	dq 0x8080808080808080, 0x8080808080808080