;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions
; are met:
;   * Redistributions of source code must retain the above copyright
;     notice, this list of conditions and the following disclaimer.
;   * Redistributions in binary form must reproduce the above copyright
;     notice, this list of conditions and the following disclaimer in
;     the documentation and/or other materials provided with the
;     distribution.
;   * Neither the name of Intel Corporation nor the names of its
;     contributors may be used to endorse or promote products derived
;     from this software without specific prior written permission.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Function API:
;       UINT32 crc32_ieee_by4(
;               UINT32 init_crc,          // initial CRC value, 32 bits
;               const unsigned char *buf, // buffer pointer to calculate CRC on
;               UINT64 len                // buffer length in bytes (64-bit data)
;       );
;
; Authors:
;       Erdinc Ozturk
;       Vinodh Gopal
;       James Guilford
;
; Reference paper titled "Fast CRC Computation for Generic Polynomials
; Using PCLMULQDQ Instruction"
; URL: http://download.intel.com/design/intarch/papers/323102.pdf
;
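; Illustrative note (editor's addition, not part of the original header):
; a plain C sketch of the bit-at-a-time CRC this routine accelerates. It
; assumes the standard non-reflected IEEE polynomial 0x04C11DB7, with the
; input/output inversion matching the `not` instructions at function entry
; and exit; the function name is illustrative only:
;
;     #include <stdint.h>
;
;     uint32_t crc32_ieee_ref(uint32_t init_crc, const uint8_t *buf,
;                             uint64_t len)
;     {
;         uint32_t crc = ~init_crc;
;         while (len--) {
;             crc ^= (uint32_t)(*buf++) << 24;
;             for (int i = 0; i < 8; i++)
;                 crc = (crc & 0x80000000) ? (crc << 1) ^ 0x04C11DB7
;                                          : (crc << 1);
;         }
;         return ~crc;
;     }
;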

%include "reg_sizes.asm"

%define fetch_dist 1024
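; fetch_dist is a prefetch-distance tuning value: the prefetchnta
; instructions in the main loop below fetch data this many bytes ahead of
; the current read position.
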
[bits 64]
default rel

section .text

%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8

        %xdefine        arg1_low32 ecx
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx

        %xdefine        arg1_low32 edi
%endif

%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_SAVE 16*2
        %define VARIABLE_OFFSET 16*4+8
%else
        %define VARIABLE_OFFSET 16*2+8
%endif

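; Stack-frame note (editor's addition): the call instruction leaves rsp at
; 8 mod 16, and both VARIABLE_OFFSET values (40 and 72) are also 8 mod 16,
; so after the sub below rsp is 16-byte aligned. That alignment is required
; by the movdqa accesses to the win64 xmm save area and to the 16-byte
; staging slot used for short inputs at _less_than_16_left.
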
align 16
global crc32_ieee_by4:function
crc32_ieee_by4:

        not     arg1_low32

        sub     rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
        ; save the xmm registers on the stack: xmm6 and xmm7 are
        ; callee-saved in the win64 ABI and are clobbered below
        movdqa  [rsp + XMM_SAVE + 16*0], xmm6
        movdqa  [rsp + XMM_SAVE + 16*1], xmm7
%endif

        ; check if the buffer is smaller than 128 bytes
        cmp     arg3, 128
        jl      _less_than_128

        ; load the initial crc value
        movd    xmm6, arg1_low32        ; initial crc
        ; the crc value does not need to be byte-reflected, but it does
        ; need to be moved to the high part of the register, because the
        ; data will be byte-reflected and will line up with the initial
        ; crc in the correct place
        pslldq  xmm6, 12

        movdqa  xmm7, [SHUF_MASK]
        ; load the initial 64B of data and xor in the initial crc value
        movdqu  xmm0, [arg2]
        movdqu  xmm1, [arg2+16]
        movdqu  xmm2, [arg2+32]
        movdqu  xmm3, [arg2+48]

        pshufb  xmm0, xmm7
        ; xor the initial_crc value
        pxor    xmm0, xmm6
        pshufb  xmm1, xmm7
        pshufb  xmm2, xmm7
        pshufb  xmm3, xmm7

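        ; pshufb mechanics (editor's note): each mask byte selects a source
        ; byte by index, so the 15,14,...,1,0 pattern in SHUF_MASK (defined
        ; at the end of this file) reverses the 16 bytes of each register,
        ; putting the data in the MSB-first order that the non-reflected
        ; polynomial arithmetic expects.
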
        movdqa  xmm6, [rk3]     ; k3 = 2^480 mod POLY << 32
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; we subtract 128 instead of 64 to save one instruction from the loop
        sub     arg3, 128

        ; at this point there are 64*x+y (0 <= y < 64) bytes of buffer left.
        ; the _fold_64_B_loop will fold 64B at a time until 64+y bytes of
        ; buffer remain

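        ; folding identity (editor's note): pclmulqdq is a 64x64 -> 128 bit
        ; carry-less multiply; immediate 0x11 multiplies the two high qwords
        ; of its operands and 0x00 the two low qwords. Each iteration below
        ; replaces a 128-bit block D with (D_hi x K_hi) xor (D_lo x K_lo),
        ; where K_hi/K_lo (the rk4/rk3 pair here) are precomputed powers of
        ; x modulo the polynomial for the 512-bit folding distance, keeping
        ; the running value congruent to the CRC of all data consumed so far.
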
        ; fold 64B at a time; this section folds 4 xmm registers in parallel
_fold_64_B_loop:

        ; update the buffer pointer
        add     arg2, 64

        prefetchnta [arg2+fetch_dist+0]
        movdqa  xmm4, xmm0
        movdqa  xmm5, xmm1

        pclmulqdq xmm0, xmm6, 0x11
        pclmulqdq xmm1, xmm6, 0x11

        pclmulqdq xmm4, xmm6, 0x0
        pclmulqdq xmm5, xmm6, 0x0

        pxor    xmm0, xmm4
        pxor    xmm1, xmm5

        prefetchnta [arg2+fetch_dist+32]
        movdqa  xmm4, xmm2
        movdqa  xmm5, xmm3

        pclmulqdq xmm2, xmm6, 0x11
        pclmulqdq xmm3, xmm6, 0x11

        pclmulqdq xmm4, xmm6, 0x0
        pclmulqdq xmm5, xmm6, 0x0

        pxor    xmm2, xmm4
        pxor    xmm3, xmm5

        movdqu  xmm4, [arg2]
        movdqu  xmm5, [arg2+16]
        pshufb  xmm4, xmm7
        pshufb  xmm5, xmm7
        pxor    xmm0, xmm4
        pxor    xmm1, xmm5

        movdqu  xmm4, [arg2+32]
        movdqu  xmm5, [arg2+48]
        pshufb  xmm4, xmm7
        pshufb  xmm5, xmm7

        pxor    xmm2, xmm4
        pxor    xmm3, xmm5

        sub     arg3, 64

        ; check if there is another 64B in the buffer to be able to fold
        jge     _fold_64_B_loop
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        add     arg2, 64
        ; at this point arg2 points at the last y bytes of the buffer,
        ; and the 64B of folded data is in the registers xmm0 - xmm3

        movdqa  xmm6, [rk1]     ; k1
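        ; note (editor's addition): this 16-byte load brings in the rk1/rk2
        ; constant pair (rk2 in the high qword of xmm6), so the 0x11/0x00
        ; immediates below select rk2 and rk1 respectively for the 128-bit
        ; folding distance.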

        ; fold the 4 xmm registers into 1 xmm register with different constants
        movdqa  xmm4, xmm0
        pclmulqdq xmm0, xmm6, 0x11
        pclmulqdq xmm4, xmm6, 0x0
        pxor    xmm1, xmm4
        xorps   xmm1, xmm0

        movdqa  xmm4, xmm1
        pclmulqdq xmm1, xmm6, 0x11
        pclmulqdq xmm4, xmm6, 0x0
        pxor    xmm2, xmm4
        xorps   xmm2, xmm1

        movdqa  xmm4, xmm2
        pclmulqdq xmm2, xmm6, 0x11
        pclmulqdq xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2

        ; instead of 64, we add 48 to the loop counter to save one
        ; instruction from the loop; instead of a cmp instruction, we rely
        ; on the sign of the result with the jl instruction
        add     arg3, 64-16
        jl      _final_reduction_for_128

        ; now we have 16+y bytes left to reduce; 16 bytes are in register
        ; xmm3 and the rest is in memory. We can fold 16 bytes at a time if
        ; y >= 16, so continue folding 16B at a time

_16B_reduction_loop:
        movdqa  xmm4, xmm3
        pclmulqdq xmm3, xmm6, 0x11
        pclmulqdq xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        movdqu  xmm0, [arg2]
        pshufb  xmm0, xmm7
        pxor    xmm3, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the jge
        ; instruction (equivalent of: cmp arg3, 16-16);
        ; check if there is any more 16B in the buffer to be able to fold
        jge     _16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm3 register

_final_reduction_for_128:
        ; check if any more data to fold; if not, compute the CRC of the
        ; final 128 bits
        add     arg3, 16
        je      _128_done

        ; here the remaining data is less than 16 bytes. Since we know there
        ; was data before the current pointer, we can back the input pointer
        ; up so the load reads exactly 16 bytes, and then adjust the
        ; registers to compensate.
_get_last_two_xmms:
        movdqa  xmm2, xmm3

        movdqu  xmm1, [arg2 - 16 + arg3]
        pshufb  xmm1, xmm7

        shl     arg3, 4
        lea     rax, [pshufb_shf_table + 15*16]
        sub     rax, arg3
        movdqu  xmm0, [rax]

        pshufb  xmm2, xmm0

        pxor    xmm0, [mask3]

        pshufb  xmm3, xmm0

        pblendvb xmm1, xmm2     ; xmm0 is implicit

        movdqa  xmm2, xmm1

        movdqa  xmm4, xmm3
        pclmulqdq xmm3, xmm6, 0x11

        pclmulqdq xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2

_128_done:

        movdqa  xmm6, [rk5]
        movdqa  xmm0, xmm3

        ; 64b fold
        pclmulqdq xmm3, xmm6, 0x1
        pslldq  xmm0, 8
        pxor    xmm3, xmm0

        ; 32b fold
        movdqa  xmm0, xmm3

        pand    xmm0, [mask4]

        psrldq  xmm3, 12
        pclmulqdq xmm3, xmm6, 0x10
        pxor    xmm3, xmm0

        ; barrett reduction
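        ; (editor's note) with P = 0x104c11db7 (rk8, the polynomial with its
        ; x^32 term included) and u = 0x104d101df (rk7, floor(x^64 / P)),
        ; the 64-bit remainder R is reduced to the final 32-bit CRC as:
        ;       T1  = floor(R / x^32) * u
        ;       T2  = floor(T1 / x^32) * P
        ;       CRC = (R xor T2) mod x^32
        ; with the divisions replaced by shifts and the multiplies done
        ; carry-lessly by pclmulqdq.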
_barrett:
        movdqa  xmm6, [rk7]
        movdqa  xmm0, xmm3
        pclmulqdq xmm3, xmm6, 0x01
        pslldq  xmm3, 4
        pclmulqdq xmm3, xmm6, 0x11

        pslldq  xmm3, 4
        pxor    xmm3, xmm0
        pextrd  eax, xmm3, 1

_cleanup:
        not     eax
%ifidn __OUTPUT_FORMAT__, win64
        movdqa  xmm6, [rsp + XMM_SAVE + 16*0]
        movdqa  xmm7, [rsp + XMM_SAVE + 16*1]
%endif
        add     rsp, VARIABLE_OFFSET

        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_128:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      _less_than_32
        movdqa  xmm7, [SHUF_MASK]

        ; if there is, load the constants
        movdqa  xmm6, [rk1]     ; k1

        movd    xmm0, arg1_low32
        pslldq  xmm0, 12
        movdqu  xmm3, [arg2]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0

        ; update the buffer pointer
        add     arg2, 16

        ; update the counter: subtract 32 instead of 16 to save one
        ; instruction from the loop
        sub     arg3, 32

        jmp     _16B_reduction_loop

align 16
_less_than_32:
        mov     eax, arg1_low32
        test    arg3, arg3
        je      _cleanup

        movdqa  xmm7, [SHUF_MASK]

        movd    xmm0, arg1_low32
        pslldq  xmm0, 12

        cmp     arg3, 16
        je      _exact_16_left
        jl      _less_than_16_left
        movd    xmm0, arg1_low32
        pslldq  xmm0, 12
        movdqu  xmm3, [arg2]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0
        add     arg2, 16
        sub     arg3, 16
        movdqa  xmm6, [rk1]     ; k1
        jmp     _get_last_two_xmms

align 16
_less_than_16_left:
        ; use stack space to load data less than 16 bytes; zero out the
        ; 16B stack slot first
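        ; staging sketch (editor's note, illustrative C only):
        ;       uint8_t tmp[16] = {0};    /* the zeroed slot at [rsp]       */
        ;       memcpy(tmp, buf, len);    /* done below as 8/4/2/1B moves   */
        ; tmp is then shifted into position and reduced as a full block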

        pxor    xmm1, xmm1
        mov     r11, rsp
        movdqa  [r11], xmm1

        cmp     arg3, 4
        jl      _only_less_than_4

        ; back up the counter value
        mov     r9, arg3

        cmp     arg3, 8
        jl      _less_than_8_left
        ; copy 8 bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
_less_than_8_left:

        cmp     arg3, 4
        jl      _less_than_4_left
        ; copy 4 bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
_less_than_4_left:

        cmp     arg3, 2
        jl      _less_than_2_left
        ; copy 2 bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
_less_than_2_left:
        cmp     arg3, 1
        jl      _zero_left
        ; copy 1 byte
        mov     al, [arg2]
        mov     [r11], al

_zero_left:
        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0

        shl     r9, 4
        lea     rax, [pshufb_shf_table + 15*16]
        sub     rax, r9
        movdqu  xmm0, [rax]
        pxor    xmm0, [mask3]

        pshufb  xmm3, xmm0
        jmp     _128_done

align 16
_exact_16_left:
        movdqu  xmm3, [arg2]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0

        jmp     _128_done

_only_less_than_4:
        cmp     arg3, 3
        jl      _only_less_than_3
        ; copy 3 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        mov     al, [arg2+2]
        mov     [r11+2], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0

        psrldq  xmm3, 5

        jmp     _barrett

_only_less_than_3:
        cmp     arg3, 2
        jl      _only_less_than_2
        ; copy 2 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0

        psrldq  xmm3, 6

        jmp     _barrett

_only_less_than_2:
        ; copy 1 byte
        mov     al, [arg2]
        mov     [r11], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0

        psrldq  xmm3, 7

        jmp     _barrett

; precomputed constants
section .data

align 16
rk1: DQ 0xf200aa6600000000
rk2: DQ 0x17d3315d00000000
rk3: DQ 0xd3504ec700000000
rk4: DQ 0x57a8445500000000
rk5: DQ 0xf200aa6600000000
rk6: DQ 0x490d678d00000000
rk7: DQ 0x0000000104d101df
rk8: DQ 0x0000000104c11db7
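
; the rk constants are used as 16-byte pairs: [rk1] loads rk1|rk2,
; [rk3] loads rk3|rk4, [rk5] loads rk5|rk6, and [rk7] loads rk7|rk8,
; with the second constant of each pair in the high qword.
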
mask:  dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
mask2: dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
mask3: dq 0x8080808080808080, 0x8080808080808080
mask4: dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

align 32
pshufb_shf_table:
        dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1)  / shr 1
        dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2)  / shr 2
        dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3)  / shr 3
        dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4)  / shr 4
        dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5)  / shr 5
        dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6)  / shr 6
        dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7)  / shr 7
        dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8)  / shr 8
        dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9)  / shr 9
        dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr 10
        dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr 11
        dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr 12
        dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr 13
        dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr 14
        dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr 15

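; SHUF_MASK holds byte indices 15..0: pshufb with this mask reverses the
; byte order of a 16-byte register (see the note above the main loop)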
SHUF_MASK dq 0x08090A0B0C0D0E0F, 0x0001020304050607

;;; func            core, ver, snum
slversion crc32_ieee_by4, 05, 02, 0017