;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Function API:
;	UINT32 crc32_ieee_by4(
;		UINT32 init_crc,          //initial CRC value, 32 bits
;		const unsigned char *buf, //buffer pointer to calculate CRC on
;		UINT64 len                //buffer length in bytes (64-bit data)
;	);
;
; Authors:
;	Erdinc Ozturk
;	Vinodh Gopal
;	James Guilford
;
; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
; URL: http://download.intel.com/design/intarch/papers/323102.pdf
;
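; Illustrative use from C (a sketch, not part of this file; the
; zero-seed and chaining conventions are inferred from the not/not
; pair at function entry and exit):
;	uint32_t crc;
;	crc = crc32_ieee_by4(0, buf, len);     // CRC of one buffer
;	crc = crc32_ieee_by4(crc, buf2, len2); // extend over a second buffer
;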

%include "reg_sizes.asm"

%define fetch_dist 1024

[bits 64]
default rel

section .text

%ifidn __OUTPUT_FORMAT__, win64
	%xdefine	arg1 rcx
	%xdefine	arg2 rdx
	%xdefine	arg3 r8

	%xdefine	arg1_low32 ecx
%else
	%xdefine	arg1 rdi
	%xdefine	arg2 rsi
	%xdefine	arg3 rdx

	%xdefine	arg1_low32 edi
%endif

%ifidn __OUTPUT_FORMAT__, win64
	%define XMM_SAVE 16*2
	%define VARIABLE_OFFSET 16*4+8
%else
	%define VARIABLE_OFFSET 16*2+8
%endif

align 16
mk_global crc32_ieee_by4, function
crc32_ieee_by4:
	endbranch

	not arg1_low32

	sub rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
	; save xmm6 and xmm7 on the stack (callee-saved in the win64 ABI)
	movdqa [rsp + XMM_SAVE + 16*0], xmm6
	movdqa [rsp + XMM_SAVE + 16*1], xmm7
%endif

	; check if smaller than 128B
	cmp arg3, 128
	jl _less_than_128


	; load the initial crc value
	movd xmm6, arg1_low32 ; initial crc
	; the crc value does not need to be byte-reflected, but it does need
	; to be moved to the high part of the register, because the data will
	; be byte-reflected and will line up with the initial crc there
	pslldq xmm6, 12
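	; (shifting left by 12 bytes puts the 32-bit crc in bits [127:96];
	; after the pshufb byte-reflection below, buffer byte 0 sits in the
	; top byte lane, so the crc xors against the first 4 message bytes)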

	movdqa xmm7, [SHUF_MASK]
	; load the initial 64B of data and xor in the initial crc value
	movdqu xmm0, [arg2]
	movdqu xmm1, [arg2+16]
	movdqu xmm2, [arg2+32]
	movdqu xmm3, [arg2+48]

	pshufb xmm0, xmm7
	; XOR the initial_crc value
	pxor xmm0, xmm6
	pshufb xmm1, xmm7
	pshufb xmm2, xmm7
	pshufb xmm3, xmm7

	movdqa xmm6, [rk3] ;k3=2^480 mod POLY << 32
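	; xmm6 now holds the qword pair {rk3, rk4}: pclmulqdq with imm8 0x0
	; multiplies by the low qword (rk3) and imm8 0x11 by the high qword
	; (rk4). Each fold step below computes, with carry-less multiplies
	; in GF(2)[x]:
	;	xmmN = (xmmN[127:64] * rk4) xor (xmmN[63:0] * rk3) xor next_16B
	; which folds each register onto the data 64B (512 bits) ahead of it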
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	; we subtract 128 instead of 64 to save one instruction from the loop
	sub arg3, 128

	; at this point there are 64*x+y (0<=y<64) bytes of buffer; the
	; _fold_64_B_loop will fold 64B at a time until 64+y bytes remain


	; fold 64B at a time; this section folds 4 xmm registers in parallel
_fold_64_B_loop:

	; update the buffer pointer
	add arg2, 64

	prefetchnta [arg2+fetch_dist+0]
	movdqa xmm4, xmm0
	movdqa xmm5, xmm1

	pclmulqdq xmm0, xmm6, 0x11
	pclmulqdq xmm1, xmm6, 0x11

	pclmulqdq xmm4, xmm6, 0x0
	pclmulqdq xmm5, xmm6, 0x0

	pxor xmm0, xmm4
	pxor xmm1, xmm5

	prefetchnta [arg2+fetch_dist+32]
	movdqa xmm4, xmm2
	movdqa xmm5, xmm3

	pclmulqdq xmm2, xmm6, 0x11
	pclmulqdq xmm3, xmm6, 0x11

	pclmulqdq xmm4, xmm6, 0x0
	pclmulqdq xmm5, xmm6, 0x0

	pxor xmm2, xmm4
	pxor xmm3, xmm5

	movdqu xmm4, [arg2]
	movdqu xmm5, [arg2+16]
	pshufb xmm4, xmm7
	pshufb xmm5, xmm7
	pxor xmm0, xmm4
	pxor xmm1, xmm5

	movdqu xmm4, [arg2+32]
	movdqu xmm5, [arg2+48]
	pshufb xmm4, xmm7
	pshufb xmm5, xmm7

	pxor xmm2, xmm4
	pxor xmm3, xmm5

	sub arg3, 64

	; check if there is another 64B in the buffer to be able to fold
	jge _fold_64_B_loop
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


	add arg2, 64
	; at this point arg2 points at the last y bytes of the buffer and
	; the folded 64B of data is in 4 xmm registers: xmm0, xmm1, xmm2, xmm3


	movdqa xmm6, [rk1] ;k1
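	; xmm6 holds the pair {rk1, rk2}, the constants for folding a single
	; 16B (128-bit) distance; imm8 0x11 multiplies the high qwords
	; (by rk2) and imm8 0x0 the low qwords (by rk1)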

	; fold the 4 xmm registers down to 1 xmm register
	movdqa xmm4, xmm0
	pclmulqdq xmm0, xmm6, 0x11
	pclmulqdq xmm4, xmm6, 0x0
	pxor xmm1, xmm4
	xorps xmm1, xmm0

	movdqa xmm4, xmm1
	pclmulqdq xmm1, xmm6, 0x11
	pclmulqdq xmm4, xmm6, 0x0
	pxor xmm2, xmm4
	xorps xmm2, xmm1

	movdqa xmm4, xmm2
	pclmulqdq xmm2, xmm6, 0x11
	pclmulqdq xmm4, xmm6, 0x0
	pxor xmm3, xmm4
	pxor xmm3, xmm2


	; instead of 64, we add 64-16=48 to the loop counter to save one
	; instruction; instead of a cmp instruction, we use the sign flag
	; with the jl instruction
	add arg3, 64-16
	jl _final_reduction_for_128

; now we have 16+y bytes left to reduce; 16 bytes are in register xmm3 and
; the rest is in memory; we can fold 16 bytes at a time if y>=16, so
; continue folding 16B at a time

_16B_reduction_loop:
	movdqa xmm4, xmm3
	pclmulqdq xmm3, xmm6, 0x11
	pclmulqdq xmm4, xmm6, 0x0
	pxor xmm3, xmm4
	movdqu xmm0, [arg2]
	pshufb xmm0, xmm7
	pxor xmm3, xmm0
	add arg2, 16
	sub arg3, 16
	; instead of a cmp instruction, we utilize the flags with the jge
	; instruction (equivalent of: cmp arg3, 16-16)
	; check if there is any more 16B in the buffer to be able to fold
	jge _16B_reduction_loop

	; now we have 16+z bytes left to reduce, where 0<=z<16;
	; first, we reduce the data in the xmm3 register



_final_reduction_for_128:
	; check if any more data to fold. If not, compute the CRC of the final 128 bits
	add arg3, 16
	je _128_done

	; here fewer than 16 bytes of tail data remain. Since we know there
	; was data before the current pointer, we can offset the input
	; pointer backwards so that a 16-byte load ends exactly at the end
	; of the buffer; after that the registers need to be adjusted.
_get_last_two_xmms:
	movdqa xmm2, xmm3

	movdqu xmm1, [arg2 - 16 + arg3]
	pshufb xmm1, xmm7

	shl arg3, 4
	lea rax, [pshufb_shf_table + 15*16]
	sub rax, arg3
	movdqu xmm0, [rax]

	pshufb xmm2, xmm0

	pxor xmm0, [mask3]

	pshufb xmm3, xmm0

	pblendvb xmm1, xmm2 ;xmm0 is implicit

	movdqa xmm2, xmm1

	movdqa xmm4, xmm3
	pclmulqdq xmm3, xmm6, 0x11

	pclmulqdq xmm4, xmm6, 0x0
	pxor xmm3, xmm4
	pxor xmm3, xmm2

_128_done:

	movdqa xmm6, [rk5]
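	; xmm6 holds the pair {rk5, rk6}: the low qword (rk5, via imm8 0x1)
	; is used by the 64b fold below and the high qword (rk6, via imm8
	; 0x10) by the 32b fold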
	movdqa xmm0, xmm3

	;64b fold
	pclmulqdq xmm3, xmm6, 0x1
	pslldq xmm0, 8
	pxor xmm3, xmm0

	;32b fold
	movdqa xmm0, xmm3

	pand xmm0, [mask4]

	psrldq xmm3, 12
	pclmulqdq xmm3, xmm6, 0x10
	pxor xmm3, xmm0

	;barrett reduction
_barrett:
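	; schematically (see the Intel paper linked above), with all
	; multiplies carry-less in GF(2)[x]:
	;	T1  = floor(R / x^32) * rk7	; rk7 = floor(x^64 / P(x))
	;	T2  = floor(T1 / x^32) * rk8	; rk8 = P(x) = 0x104c11db7
	;	crc = (R xor T2) bits [63:32]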
	movdqa xmm6, [rk7]
	movdqa xmm0, xmm3
	pclmulqdq xmm3, xmm6, 0x01
	pslldq xmm3, 4
	pclmulqdq xmm3, xmm6, 0x11

	pslldq xmm3, 4
	pxor xmm3, xmm0
	pextrd eax, xmm3, 1

_cleanup:
	not eax
%ifidn __OUTPUT_FORMAT__, win64
	movdqa xmm6, [rsp + XMM_SAVE + 16*0]
	movdqa xmm7, [rsp + XMM_SAVE + 16*1]
%endif
	add rsp, VARIABLE_OFFSET

	ret



;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_128:

	; check if there is enough buffer to be able to fold 16B at a time
	cmp arg3, 32
	jl _less_than_32
	movdqa xmm7, [SHUF_MASK]

	; if there is, load the constants
	movdqa xmm6, [rk1] ;k1

	movd xmm0, arg1_low32
	pslldq xmm0, 12
	movdqu xmm3, [arg2]
	pshufb xmm3, xmm7
	pxor xmm3, xmm0


	; update the buffer pointer
	add arg2, 16

	; update the counter; subtract 32 instead of 16 to save one
	; instruction from the loop
	sub arg3, 32

	jmp _16B_reduction_loop


align 16
_less_than_32:
	mov eax, arg1_low32
	test arg3, arg3
	je _cleanup

	movdqa xmm7, [SHUF_MASK]

	movd xmm0, arg1_low32
	pslldq xmm0, 12

	cmp arg3, 16
	je _exact_16_left
	jl _less_than_16_left
	movd xmm0, arg1_low32
	pslldq xmm0, 12
	movdqu xmm3, [arg2]
	pshufb xmm3, xmm7
	pxor xmm3, xmm0
	add arg2, 16
	sub arg3, 16
	movdqa xmm6, [rk1] ;k1
	jmp _get_last_two_xmms


align 16
_less_than_16_left:
	; use stack space to load data less than 16 bytes;
	; zero out the 16B of stack memory first
	pxor xmm1, xmm1
	mov r11, rsp
	movdqa [r11], xmm1


	cmp arg3, 4
	jl _only_less_than_4
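	; copy the remaining 4..15 bytes onto the zeroed stack slot in
	; descending chunks (8, 4, 2, then 1 bytes); r9 keeps the original
	; byte count for the shift-mask lookup at _zero_left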

	mov r9, arg3


	cmp arg3, 8
	jl _less_than_8_left
	mov rax, [arg2]
	mov [r11], rax
	add r11, 8
	sub arg3, 8
	add arg2, 8
_less_than_8_left:

	cmp arg3, 4
	jl _less_than_4_left
	mov eax, [arg2]
	mov [r11], eax
	add r11, 4
	sub arg3, 4
	add arg2, 4
_less_than_4_left:

	cmp arg3, 2
	jl _less_than_2_left
	mov ax, [arg2]
	mov [r11], ax
	add r11, 2
	sub arg3, 2
	add arg2, 2
_less_than_2_left:
	cmp arg3, 1
	jl _zero_left

	mov al, [arg2]
	mov [r11], al

_zero_left:
	movdqa xmm3, [rsp]
	pshufb xmm3, xmm7
	pxor xmm3, xmm0

	shl r9, 4
	lea rax, [pshufb_shf_table + 15*16]
	sub rax, r9
	movdqu xmm0, [rax]
	pxor xmm0, [mask3]

	pshufb xmm3, xmm0
	jmp _128_done

align 16
_exact_16_left:
	movdqu xmm3, [arg2]
	pshufb xmm3, xmm7
	pxor xmm3, xmm0

	jmp _128_done

_only_less_than_4:
	cmp arg3, 3
	jl _only_less_than_3
	mov al, [arg2]
	mov [r11], al

	mov al, [arg2+1]
	mov [r11+1], al

	mov al, [arg2+2]
	mov [r11+2], al

	movdqa xmm3, [rsp]
	pshufb xmm3, xmm7
	pxor xmm3, xmm0

	psrldq xmm3, 5

	jmp _barrett
_only_less_than_3:
	cmp arg3, 2
	jl _only_less_than_2
	mov al, [arg2]
	mov [r11], al

	mov al, [arg2+1]
	mov [r11+1], al

	movdqa xmm3, [rsp]
	pshufb xmm3, xmm7
	pxor xmm3, xmm0

	psrldq xmm3, 6

	jmp _barrett
_only_less_than_2:
	mov al, [arg2]
	mov [r11], al

	movdqa xmm3, [rsp]
	pshufb xmm3, xmm7
	pxor xmm3, xmm0

	psrldq xmm3, 7

	jmp _barrett
; precomputed constants
section .data
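; rk1-rk6 are folding constants of the form (x^n mod P(x)) << 32 for the
; IEEE polynomial P(x) = 0x104c11db7 (rk3, for example, is referenced
; above as 2^480 mod POLY << 32); rk7 is the Barrett constant
; floor(x^64 / P(x)) and rk8 is P(x) itself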

align 16
rk1:
DQ 0xf200aa6600000000
rk2:
DQ 0x17d3315d00000000
rk3:
DQ 0xd3504ec700000000
rk4:
DQ 0x57a8445500000000
rk5:
DQ 0xf200aa6600000000
rk6:
DQ 0x490d678d00000000
rk7:
DQ 0x0000000104d101df
rk8:
DQ 0x0000000104c11db7
mask:
dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
mask2:
dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
mask3:
dq 0x8080808080808080, 0x8080808080808080
mask4:
dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
align 32
pshufb_shf_table:
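; each 16-byte entry, applied with pshufb, shifts a register left by
; (16-n) bytes: mask bytes with the top bit set (0x80..) make pshufb
; write zero, and xor-ing an entry with mask3 (all 0x80) turns it into
; the complementary right-shift mask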

	dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
	dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
	dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
	dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
	dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
	dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
	dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
	dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
	dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
	dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
	dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
	dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
	dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
	dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
	dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15

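; byte-reflection mask: pshufb with SHUF_MASK reverses the 16 bytes of
; an xmm register (this implementation works on the non-reflected bit
; order, so input data is byte-swapped as it is loaded)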
SHUF_MASK dq 0x08090A0B0C0D0E0F, 0x0001020304050607

;;;       func            core, ver, snum
slversion crc32_ieee_by4, 05, 02, 0017