;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;       Function API:
;       uint64_t crc64_jones_refl_by8(
;               uint64_t init_crc,        // initial CRC value, 64 bits
;               const unsigned char *buf, // buffer pointer to calculate CRC on
;               uint64_t len              // buffer length in bytes (64-bit data)
;       );
;
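;       For reference: the routine computes the same value as a bitwise
;       reflected CRC-64 with inverted initial and final values (the
;       'not' at entry and exit below). A minimal C sketch of the intended
;       semantics; the reflected Jones polynomial constant is an assumption
;       taken from the published CRC-64/Jones parameters, not stated in
;       this file:
;
;       uint64_t crc64_jones_refl_ref(uint64_t init_crc,
;                                     const unsigned char *buf,
;                                     uint64_t len)
;       {
;               uint64_t rem = ~init_crc;
;               while (len--) {
;                       rem ^= *buf++;
;                       for (int i = 0; i < 8; i++)     // LSB-first bit loop
;                               rem = (rem >> 1) ^ ((rem & 1) ? 0x95ac9329ac4bc9b5ULL : 0);
;               }
;               return ~rem;
;       }
;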
%include "reg_sizes.asm"

%define fetch_dist 1024

[bits 64]
default rel

section .text


%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx
%endif

%define TMP 16*0
%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_SAVE 16*2
        %define VARIABLE_OFFSET 16*10+8
%else
        %define VARIABLE_OFFSET 16*2+8
%endif


align 16
global crc64_jones_refl_by8:ISAL_SYM_TYPE_FUNCTION
crc64_jones_refl_by8:
        ; uint64_t c = crc ^ 0xffffffffffffffffL;
        not     arg1
        sub     rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
        ; save the xmm registers on the stack; xmm6-xmm15 are callee-saved on win64
        movdqa  [rsp + XMM_SAVE + 16*0], xmm6
        movdqa  [rsp + XMM_SAVE + 16*1], xmm7
        movdqa  [rsp + XMM_SAVE + 16*2], xmm8
        movdqa  [rsp + XMM_SAVE + 16*3], xmm9
        movdqa  [rsp + XMM_SAVE + 16*4], xmm10
        movdqa  [rsp + XMM_SAVE + 16*5], xmm11
        movdqa  [rsp + XMM_SAVE + 16*6], xmm12
        movdqa  [rsp + XMM_SAVE + 16*7], xmm13
%endif

        ; check if the buffer is smaller than 256B
        cmp     arg3, 256

        ; for sizes less than 256, we can't fold 128B at a time...
        jl      _less_than_256


        ; load the initial crc value
        movq    xmm10, arg1     ; initial crc
        ; load the initial 128B of data and xor in the initial crc value
        movdqu  xmm0, [arg2+16*0]
        movdqu  xmm1, [arg2+16*1]
        movdqu  xmm2, [arg2+16*2]
        movdqu  xmm3, [arg2+16*3]
        movdqu  xmm4, [arg2+16*4]
        movdqu  xmm5, [arg2+16*5]
        movdqu  xmm6, [arg2+16*6]
        movdqu  xmm7, [arg2+16*7]

        ; XOR the initial_crc value
        pxor    xmm0, xmm10
        movdqa  xmm10, [rk3]    ; xmm10 has rk3 and rk4
                                ; the imm8 of each pclmulqdq selects which constant to use
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; we subtract 256 instead of 128 to save one instruction from the loop
        sub     arg3, 256

        ; at this point there are 128*x+y (0 <= y < 128) bytes of buffer left.
        ; the _fold_128_B_loop folds 128B at a time until 128+y bytes remain
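
        ; One lane of the 128-byte fold, sketched in pseudo-C (clmul() is
        ; a 64x64->128 carry-less multiply, i.e. one PCLMULQDQ; the imm8
        ; values are the ones used in the loop below):
        ;
        ;       // x    = one folded 128-bit lane (xmm0..xmm7)
        ;       // data = the matching 16B of input 128 bytes ahead
        ;       x = clmul(low64(x), rk4)        // imm8 = 0x10
        ;         ^ clmul(high64(x), rk3)       // imm8 = 0x01
        ;         ^ data;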


; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
_fold_128_B_loop:

        ; update the buffer pointer
        add     arg2, 128

        prefetchnta [arg2+fetch_dist+0]
        movdqu  xmm9, [arg2+16*0]
        movdqu  xmm12, [arg2+16*1]
        movdqa  xmm8, xmm0
        movdqa  xmm13, xmm1
        pclmulqdq       xmm0, xmm10, 0x10
        pclmulqdq       xmm8, xmm10, 0x1
        pclmulqdq       xmm1, xmm10, 0x10
        pclmulqdq       xmm13, xmm10, 0x1
        pxor    xmm0, xmm9
        xorps   xmm0, xmm8
        pxor    xmm1, xmm12
        xorps   xmm1, xmm13

        prefetchnta [arg2+fetch_dist+32]
        movdqu  xmm9, [arg2+16*2]
        movdqu  xmm12, [arg2+16*3]
        movdqa  xmm8, xmm2
        movdqa  xmm13, xmm3
        pclmulqdq       xmm2, xmm10, 0x10
        pclmulqdq       xmm8, xmm10, 0x1
        pclmulqdq       xmm3, xmm10, 0x10
        pclmulqdq       xmm13, xmm10, 0x1
        pxor    xmm2, xmm9
        xorps   xmm2, xmm8
        pxor    xmm3, xmm12
        xorps   xmm3, xmm13

        prefetchnta [arg2+fetch_dist+64]
        movdqu  xmm9, [arg2+16*4]
        movdqu  xmm12, [arg2+16*5]
        movdqa  xmm8, xmm4
        movdqa  xmm13, xmm5
        pclmulqdq       xmm4, xmm10, 0x10
        pclmulqdq       xmm8, xmm10, 0x1
        pclmulqdq       xmm5, xmm10, 0x10
        pclmulqdq       xmm13, xmm10, 0x1
        pxor    xmm4, xmm9
        xorps   xmm4, xmm8
        pxor    xmm5, xmm12
        xorps   xmm5, xmm13

        prefetchnta [arg2+fetch_dist+96]
        movdqu  xmm9, [arg2+16*6]
        movdqu  xmm12, [arg2+16*7]
        movdqa  xmm8, xmm6
        movdqa  xmm13, xmm7
        pclmulqdq       xmm6, xmm10, 0x10
        pclmulqdq       xmm8, xmm10, 0x1
        pclmulqdq       xmm7, xmm10, 0x10
        pclmulqdq       xmm13, xmm10, 0x1
        pxor    xmm6, xmm9
        xorps   xmm6, xmm8
        pxor    xmm7, xmm12
        xorps   xmm7, xmm13

        sub     arg3, 128

        ; check if there is another 128B in the buffer to be able to fold
        jge     _fold_128_B_loop
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        add     arg2, 128
        ; at this point, the buffer pointer is pointing at the last y bytes of the buffer, where 0 <= y < 128
        ; the 128B of folded data is in 8 of the xmm registers: xmm0 through xmm7


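        ; Each of xmm0..xmm6 sits a different distance ahead of xmm7 (112B
        ; down to 16B), so each fold below uses its own constant pair:
        ; rk9/rk10 for xmm0 (112B), rk11/rk12 for xmm1 (96B), and so on
        ; down to rk19/rk20 for xmm5 (32B) and rk1/rk2 for xmm6 (16B).
        ; Every step computes, in pseudo-C:
        ;
        ;       xmm7 ^= clmul(high64(x_i), rk_odd) ^ clmul(low64(x_i), rk_even);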
        ; fold the 8 xmm registers to 1 xmm register with different constants
        ; xmm0 to xmm7
        movdqa  xmm10, [rk9]
        movdqa  xmm8, xmm0
        pclmulqdq       xmm0, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        xorps   xmm7, xmm0
        ; xmm1 to xmm7
        movdqa  xmm10, [rk11]
        movdqa  xmm8, xmm1
        pclmulqdq       xmm1, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        xorps   xmm7, xmm1

        movdqa  xmm10, [rk13]
        movdqa  xmm8, xmm2
        pclmulqdq       xmm2, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        pxor    xmm7, xmm2

        movdqa  xmm10, [rk15]
        movdqa  xmm8, xmm3
        pclmulqdq       xmm3, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        xorps   xmm7, xmm3

        movdqa  xmm10, [rk17]
        movdqa  xmm8, xmm4
        pclmulqdq       xmm4, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        pxor    xmm7, xmm4

        movdqa  xmm10, [rk19]
        movdqa  xmm8, xmm5
        pclmulqdq       xmm5, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        xorps   xmm7, xmm5
        ; xmm6 to xmm7
        movdqa  xmm10, [rk1]
        movdqa  xmm8, xmm6
        pclmulqdq       xmm6, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        pxor    xmm7, xmm6


        ; instead of 128, we add 128-16 to the loop counter to save one instruction from the loop
        ; instead of a cmp instruction, we use the sign flag with the jl instruction
        add     arg3, 128-16
        jl      _final_reduction_for_128

        ; now we have 16+y bytes left to reduce. 16 bytes are in register xmm7 and the rest is in memory
        ; we can fold 16 bytes at a time if y >= 16
        ; continue folding 16B at a time

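        ; One iteration of the 16B fold, in pseudo-C (xmm10 holds the
        ; rk1/rk2 pair on every path that reaches this loop):
        ;
        ;       xmm7 = clmul(high64(xmm7), rk1)         // imm8 = 0x01
        ;            ^ clmul(low64(xmm7), rk2)          // imm8 = 0x10
        ;            ^ load16(arg2);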
_16B_reduction_loop:
        movdqa  xmm8, xmm7
        pclmulqdq       xmm7, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        movdqu  xmm0, [arg2]
        pxor    xmm7, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the jge instruction
        ; equivalent of: cmp arg3, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge     _16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm7 register


_final_reduction_for_128:
        add     arg3, 16
        je      _128_done
        ; here we are left with less than 16 bytes of data.
        ; since we know there was data before the current pointer, we can back the
        ; input pointer up to load exactly 16 bytes, then adjust the registers.
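        ; Sketch of the merge done below, in pseudo-C (shifts are byte-wise,
        ; implemented with pshufb and the shift table):
        ;
        ;       xmm1 = load16(arg2 - 16 + arg3);     // last 16B, overlaps folded data
        ;       xmm2 = old_xmm7 >> arg3;             // stale bytes dropped
        ;       xmm2 = blend(xmm2, xmm1);            // top arg3 bytes from the new tail
        ;       xmm7 = old_xmm7 << (16 - arg3);      // part still to be folded
        ;       xmm7 = fold16(xmm7) ^ xmm2;          // one more rk1/rk2 fold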
_get_last_two_xmms:


        movdqa  xmm2, xmm7
        movdqu  xmm1, [arg2 - 16 + arg3]

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table]
        add     rax, arg3
        movdqu  xmm0, [rax]


        pshufb  xmm7, xmm0
        pxor    xmm0, [mask3]
        pshufb  xmm2, xmm0

        pblendvb        xmm2, xmm1      ; xmm0 is implicit
        ;;;;;;;;;;
        movdqa  xmm8, xmm7
        pclmulqdq       xmm7, xmm10, 0x1

        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        pxor    xmm7, xmm2

_128_done:
        ; compute crc of a 128-bit value
        movdqa  xmm10, [rk5]
        movdqa  xmm0, xmm7

        ; 64b fold
        pclmulqdq       xmm7, xmm10, 0
        psrldq  xmm0, 8
        pxor    xmm7, xmm0

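        ; Barrett reduction in the reflected domain, in pseudo-C
        ; (rk7 = floor(2^128/Q) and rk8 = Q, per the .data section below):
        ;
        ;       // R = the 128-bit value left in xmm7
        ;       T1  = clmul(low64(R), rk7);
        ;       T2  = clmul(low64(T1), rk8) ^ (T1 << 64) ^ R;
        ;       crc = high64(T2);               // extracted with pextrq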
        ; barrett reduction
_barrett:
        movdqa  xmm1, xmm7
        movdqa  xmm10, [rk7]

        pclmulqdq       xmm7, xmm10, 0
        movdqa  xmm2, xmm7
        pclmulqdq       xmm7, xmm10, 0x10
        pslldq  xmm2, 8
        pxor    xmm7, xmm2
        pxor    xmm7, xmm1
        pextrq  rax, xmm7, 1

_cleanup:
        ; return c ^ 0xffffffffffffffffL;
        not     rax


%ifidn __OUTPUT_FORMAT__, win64
        movdqa  xmm6, [rsp + XMM_SAVE + 16*0]
        movdqa  xmm7, [rsp + XMM_SAVE + 16*1]
        movdqa  xmm8, [rsp + XMM_SAVE + 16*2]
        movdqa  xmm9, [rsp + XMM_SAVE + 16*3]
        movdqa  xmm10, [rsp + XMM_SAVE + 16*4]
        movdqa  xmm11, [rsp + XMM_SAVE + 16*5]
        movdqa  xmm12, [rsp + XMM_SAVE + 16*6]
        movdqa  xmm13, [rsp + XMM_SAVE + 16*7]
%endif
        add     rsp, VARIABLE_OFFSET
        ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_256:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      _less_than_32

        ; if there is, load the constants
        movdqa  xmm10, [rk1]    ; rk1 and rk2 in xmm10

        movq    xmm0, arg1      ; get the initial crc value
        movdqu  xmm7, [arg2]    ; load the plaintext
        pxor    xmm7, xmm0

        ; update the buffer pointer
        add     arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        sub     arg3, 32

        jmp     _16B_reduction_loop

align 16
_less_than_32:
        ; move the initial crc to the return value; this is necessary for zero-length buffers.
        mov     rax, arg1
        test    arg3, arg3
        je      _cleanup

        movq    xmm0, arg1      ; get the initial crc value

        cmp     arg3, 16
        je      _exact_16_left
        jl      _less_than_16_left

        movdqu  xmm7, [arg2]    ; load the plaintext
        pxor    xmm7, xmm0      ; xor the initial crc value
        add     arg2, 16
        sub     arg3, 16
        movdqa  xmm10, [rk1]    ; rk1 and rk2 in xmm10
        jmp     _get_last_two_xmms


align 16
_less_than_16_left:
        ; use stack space to load data less than 16 bytes; zero out the 16B in memory first.
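        ; in effect, this tail path does (pseudo-C sketch):
        ;
        ;       uint8_t tmp[16] = {0};          // zeroed 16B stack slot
        ;       memcpy(tmp, arg2, arg3);        // arg3 < 16, copied in 8/4/2/1-byte chunks
        ;       xmm7 = load16(tmp) ^ init_crc;  // then shifted into place via pshufb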

        pxor    xmm1, xmm1
        mov     r11, rsp
        movdqa  [r11], xmm1

        ; backup the counter value
        mov     r9, arg3
        cmp     arg3, 8
        jl      _less_than_8_left

        ; load 8 Bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
_less_than_8_left:

        cmp     arg3, 4
        jl      _less_than_4_left

        ; load 4 Bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
_less_than_4_left:

        cmp     arg3, 2
        jl      _less_than_2_left

        ; load 2 Bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
_less_than_2_left:
        cmp     arg3, 1
        jl      _zero_left

        ; load 1 Byte
        mov     al, [arg2]
        mov     [r11], al

_zero_left:
        movdqa  xmm7, [rsp]
        pxor    xmm7, xmm0      ; xor the initial crc value

        lea     rax, [pshufb_shf_table]

        cmp     r9, 8
        jl      _end_1to7

_end_8to15:
        movdqu  xmm0, [rax + r9]
        pshufb  xmm7, xmm0
        jmp     _128_done

_end_1to7:
        ; left shift (8-length) bytes in XMM
        movdqu  xmm0, [rax + r9 + 8]
        pshufb  xmm7, xmm0

        jmp     _barrett

align 16
_exact_16_left:
        movdqu  xmm7, [arg2]
        pxor    xmm7, xmm0      ; xor the initial crc value

        jmp     _128_done

section .data

; precomputed constants
align 16
; rk7 = floor(2^128/Q)
; rk8 = Q
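; The remaining rk constants are fold multipliers of the form x^N mod Q in
; the bit-reflected domain, per the pclmulqdq folding method: rk1/rk2 fold
; by 16 bytes, rk3/rk4 by 128 bytes, rk5 folds 128 bits down to 64, and
; rk9..rk20 cover the 112B..32B distances used when collapsing the eight
; lanes. The exact exponents are not stated in this file; this summary is
; inferred from how each pair is used above.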
rk1:
DQ 0x381d0015c96f4444
rk2:
DQ 0xd9d7be7d505da32c
rk3:
DQ 0x768361524d29ed0b
rk4:
DQ 0xcc26fa7c57f8054c
rk5:
DQ 0x381d0015c96f4444
rk6:
DQ 0x0000000000000000
rk7:
DQ 0x3e6cfa329aef9f77
rk8:
DQ 0x2b5926535897936a
rk9:
DQ 0x5bc94ba8e2087636
rk10:
DQ 0x6cf09c8f37710b75
rk11:
DQ 0x3885fd59e440d95a
rk12:
DQ 0xbccba3936411fb7e
rk13:
DQ 0xe4dd0d81cbfce585
rk14:
DQ 0xb715e37b96ed8633
rk15:
DQ 0xf49784a634f014e4
rk16:
DQ 0xaf86efb16d9ab4fb
rk17:
DQ 0x7b3211a760160db8
rk18:
DQ 0xa062b2319d66692f
rk19:
DQ 0xef3d1d18ed889ed2
rk20:
DQ 0x6ba4d760ab38201e

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908


mask:
dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
mask2:
dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
mask3:
dq 0x8080808080808080, 0x8080808080808080

;;; func        core, ver, snum
slversion crc64_jones_refl_by8, 01, 00, 0029