[ceph.git] / ceph / src / isa-l / crc / crc16_t10dif_by4.asm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
;       Function API:
;       UINT16 crc16_t10dif_by4(
;               UINT16 init_crc, //initial CRC value, 16 bits
;               const unsigned char *buf, //buffer pointer to calculate CRC on
;               UINT64 len //buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://download.intel.com/design/intarch/papers/323102.pdf
;

%include "reg_sizes.asm"

%define	fetch_dist	1024

[bits 64]
default rel

section .text
; Map the three C-level arguments (init_crc, buf, len) onto the integer
; argument registers of the ABI implied by the assembler output format.
; arg1_low32 is the 32-bit view of arg1 and is what the code uses to
; manipulate the 16-bit initial CRC.
%ifnidn __OUTPUT_FORMAT__, win64
	; every output format other than win64: rdi, rsi, rdx
	%xdefine	arg1_low32	edi

	%xdefine	arg1	rdi
	%xdefine	arg2	rsi
	%xdefine	arg3	rdx
%else
	; win64 output format: rcx, rdx, r8
	%xdefine	arg1_low32	ecx

	%xdefine	arg1	rcx
	%xdefine	arg2	rdx
	%xdefine	arg3	r8
%endif

;-----------------------------------------------------------------------
; crc16_t10dif_by4 -- entry point and set-up for the 64-byte fold loop
;
; In:    arg1_low32 = init_crc, arg2 = buf, arg3 = len
;        (arg1/arg2/arg3 are the %xdefine register aliases declared
;        near the top of this file; see "Function API" in the header)
; Out:   eax = CRC, finalised at _cleanup
; Uses:  rax, r9, r11, xmm0-xmm7, flags; arg1_low32, arg2 and arg3 are
;        modified in place. xmm6 and xmm7 are saved here and restored
;        at _cleanup.
; Stack: reserves 16*4+8 bytes
;        [rsp + 16*0]  16-byte scratch used by the < 16 byte paths
;        [rsp + 16*2]  saved xmm6
;        [rsp + 16*3]  saved xmm7
;        The movdqa accesses to these slots need rsp to be 16-byte
;        aligned after the sub; this holds assuming the usual
;        "rsp % 16 == 8 at function entry" ABI condition.
;-----------------------------------------------------------------------
align 16
global	crc16_t10dif_by4:ISAL_SYM_TYPE_FUNCTION
crc16_t10dif_by4:

	; adjust the 16-bit initial_crc value, scale it to 32 bits
	; (anything above bit 15 of the incoming value is shifted out)
	shl	arg1_low32, 16

	; After this point, code flow is exactly same as a 32-bit CRC.
	; The only difference is before returning eax, we will shift
	; it right 16 bits, to scale back to 16 bits.

	sub	rsp,16*4+8

	; save xmm6 and xmm7 on the stack so that they can be restored
	; at _cleanup; both are used as working registers below
	movdqa [rsp+16*2],xmm6
	movdqa [rsp+16*3],xmm7

	; check if smaller than 128B
	; note: jl is a signed comparison, so len is interpreted as a
	; signed 64-bit value by this dispatch
	cmp	arg3, 128

	; for sizes less than 128, we can't fold 64B at a time...
	jl	_less_than_128


	; load the initial crc value
	movd	xmm6, arg1_low32	; initial crc (already shifted left by 16)

	; crc value does not need to be byte-reflected, but it needs to
	; be moved to the high part of the register.
	; because data will be byte-reflected and will align with
	; initial crc at correct place.
	pslldq	xmm6, 12		; move the dword into the top 4 bytes of xmm6

	movdqa xmm7, [SHUF_MASK]	; xmm7 = byte-reversal control for pshufb
	; receive the initial 64B data, xor the initial crc value
	movdqu	xmm0, [arg2]
	movdqu	xmm1, [arg2+16]
	movdqu	xmm2, [arg2+32]
	movdqu	xmm3, [arg2+48]

	pshufb	xmm0, xmm7
	; XOR the initial_crc value (only into the first 16 bytes)
	pxor	xmm0, xmm6
	pshufb	xmm1, xmm7
	pshufb	xmm2, xmm7
	pshufb	xmm3, xmm7

	movdqa	xmm6, [rk3]	;xmm6 = rk3 (low qword) and rk4 (high qword)
					;imm value of pclmulqdq instruction
					;will determine which constant to use
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	; we subtract 128 instead of 64 to save one instruction from the loop
	; (the loop below can then branch directly on the flags of its own sub)
	sub	arg3, 128

	; at this section of the code, there is 64*x+y (0<=y<64) bytes of
	; buffer. The _fold_64_B_loop
	; loop will fold 64B at a time until we have 64+y Bytes of buffer
	;
	; State on entry to the loop:
	;   xmm0..xmm3 = 64 bytes of byte-reversed input (initial crc
	;                already XORed into xmm0)
	;   xmm6       = rk3 (low qword) : rk4 (high qword)
	;   xmm7       = SHUF_MASK
	;   arg2       = address of the 64 bytes currently held in xmm0..xmm3
	;   arg3       = len - 128


	; fold 64B at a time. This section of the code folds 4 xmm
	; registers in parallel
_fold_64_B_loop:

	; update the buffer pointer
	add	arg2, 64		;    buf += 64;

	; non-temporal prefetch hint, fetch_dist bytes ahead of the new pointer
	prefetchnta [arg2+fetch_dist+0]
	movdqu	xmm4, xmm0		; keep copies: each register needs two products
	movdqu	xmm5, xmm1

	; imm 0x11: high qword of the accumulator times high qword of xmm6 (rk4)
	pclmulqdq	xmm0, xmm6 , 0x11
	pclmulqdq	xmm1, xmm6 , 0x11

	; imm 0x0: low qword of the accumulator times low qword of xmm6 (rk3)
	pclmulqdq	xmm4, xmm6, 0x0
	pclmulqdq	xmm5, xmm6, 0x0

	; combine the two partial products
	pxor	xmm0, xmm4
   	pxor	xmm1, xmm5

	prefetchnta [arg2+fetch_dist+32]
	movdqu	xmm4, xmm2
	movdqu	xmm5, xmm3

	; same two-product fold for the remaining two accumulators
	pclmulqdq	xmm2, xmm6, 0x11
	pclmulqdq	xmm3, xmm6, 0x11

	pclmulqdq	xmm4, xmm6, 0x0
	pclmulqdq	xmm5, xmm6, 0x0

	pxor	xmm2, xmm4
	pxor	xmm3, xmm5

	; load the next 64 bytes, byte-reverse them and XOR them into
	; the folded accumulators
	movdqu	xmm4, [arg2]
	movdqu	xmm5, [arg2+16]
	pshufb	xmm4, xmm7
	pshufb	xmm5, xmm7
	pxor	xmm0, xmm4
	pxor	xmm1, xmm5

	movdqu	xmm4, [arg2+32]
	movdqu	xmm5, [arg2+48]
	pshufb	xmm4, xmm7
	pshufb	xmm5, xmm7

	pxor	xmm2, xmm4
	pxor	xmm3, xmm5

	; arg3 was pre-biased by -128, so a non-negative (signed) result
	; means at least another 64 bytes are available
	sub	arg3, 64

	; check if there is another 64B in the buffer to be able to fold
	jge	_fold_64_B_loop
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


	; step past the last 64 bytes that were absorbed by the loop above
	add	arg2, 64
	; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
	; the 64B of folded data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3
	; arg3 holds y - 64 here (a negative value, since 0 <= y < 64)


	; fold the 4 xmm registers to 1 xmm register with different constants

	movdqa	xmm6, [rk1]	;xmm6 = rk1 (low qword) and rk2 (high qword)
					;imm value of pclmulqdq instruction will
					;determine which constant to use

	; xmm1 ^= fold(xmm0): high qword * rk2 (imm 0x11), low qword * rk1 (imm 0x0)
	movdqa	xmm4, xmm0
	pclmulqdq	xmm0, xmm6, 0x11
	pclmulqdq	xmm4, xmm6, 0x0
	pxor	xmm1, xmm4
	pxor	xmm1, xmm0

	; xmm2 ^= fold(xmm1)
	movdqa	xmm4, xmm1
	pclmulqdq	xmm1, xmm6, 0x11
	pclmulqdq	xmm4, xmm6, 0x0
	pxor	xmm2, xmm4
	pxor	xmm2, xmm1

	; xmm3 ^= fold(xmm2); xmm3 now carries the whole running state
	movdqa	xmm4, xmm2
	pclmulqdq	xmm2, xmm6, 0x11
	pclmulqdq	xmm4, xmm6, 0x0
	pxor	xmm3, xmm4
	pxor	xmm3, xmm2


	; instead of 64, we add 48 to the loop counter to save 1 instruction from the loop
	; instead of a cmp instruction, we use the flags of the add with the jl instruction
	; (arg3 becomes y - 16; a negative value means fewer than 16 bytes remain)
	add	arg3, 64-16
	jl	_final_reduction_for_128

	; now we have 16+y bytes left to reduce. 16 Bytes
	; is in register xmm3 and the rest is in memory
	; we can fold 16 bytes at a time if y>=16
	; continue folding 16B at a time

	; This label is also entered from _less_than_128, which sets up
	; xmm3 = first 16 input bytes (byte-reversed, crc XORed in),
	; xmm6 = rk1:rk2, xmm7 = SHUF_MASK, arg2 = buf + 16, arg3 = len - 32.
_16B_reduction_loop:
	; fold the running state with rk1/rk2 ...
	movdqa	xmm4, xmm3
	pclmulqdq	xmm3, xmm6, 0x11
	pclmulqdq	xmm4, xmm6, 0x0
	pxor	xmm3, xmm4
	; ... then absorb the next 16 input bytes (byte-reversed)
	movdqu	xmm0, [arg2]
	pshufb	xmm0, xmm7
	pxor	xmm3, xmm0
	add	arg2, 16
	sub	arg3, 16
	; instead of a cmp instruction, we utilize the flags with the jge instruction
	; equivalent of: cmp arg3, 16-16
	; check if there is any more 16B in the buffer to be able to fold
	jge	_16B_reduction_loop

	;now we have 16+z bytes left to reduce, where 0<= z < 16.
	;first, we reduce the data in the xmm3 register
	;(arg3 holds z - 16 when falling through or branching to the label below)


_final_reduction_for_128:
	; check if any more data to fold. If not, compute the CRC of the final 128 bits
	; after the add, arg3 = z = number of trailing bytes not yet absorbed
	add	arg3, 16
	je	_128_done

	; here we are getting data that is less than 16 bytes.
	; since we know that there was data before the pointer,
	; we can offset the input pointer before the actual point,
	; to receive exactly 16 bytes.
	; after that the registers need to be adjusted.
	;
	; This label is also entered from _less_than_32 (16 < len < 32) with
	; xmm3 = first 16 input bytes (byte-reversed, crc XORed in),
	; xmm6 = rk1:rk2, xmm7 = SHUF_MASK, arg2 = buf + 16, arg3 = len - 16.
_get_last_two_xmms:
	movdqa	xmm2, xmm3		; second copy of the running state

	; load the final 16 bytes of the buffer; this window overlaps
	; 16 - arg3 bytes that are already part of the running state
	movdqu	xmm1, [arg2 - 16 + arg3]
	pshufb	xmm1, xmm7

	; get rid of the extra data that was loaded before
	; load the shift constant
	; (a 16-byte pshufb control read at pshufb_shf_table + 16 - arg3)
	lea	rax, [pshufb_shf_table + 16]
	sub	rax, arg3
	movdqu	xmm0, [rax]

	; shift xmm2 to the left by arg3 bytes
	pshufb	xmm2, xmm0

	; shift xmm3 to the right by 16-arg3 bytes
	; XOR with mask1 flips bit 7 of every control byte, i.e. it swaps
	; which result bytes pshufb zeroes and which ones it copies
	pxor	xmm0, [mask1]
	pshufb	xmm3, xmm0
	; per byte: take xmm2 where bit 7 of the xmm0 byte is set, else keep xmm1
	pblendvb	xmm1, xmm2	;xmm0 is implicit

	; fold 16 Bytes
	; (xmm3 folded with rk1/rk2, then XORed with the blended block)
	movdqa	xmm2, xmm1
	movdqa	xmm4, xmm3
	pclmulqdq	xmm3, xmm6, 0x11
	pclmulqdq	xmm4, xmm6, 0x0
	pxor	xmm3, xmm4
	pxor	xmm3, xmm2

; Final stage: reduce the 128-bit running state in xmm3 to the CRC.
; Reached by fall-through and from _final_reduction_for_128,
; _less_than_16_left (4..15 bytes) and _exact_16_left.
_128_done:
	; compute crc of a 128-bit value
	movdqa	xmm6, [rk5]	; rk5 (low qword) and rk6 (high qword) in xmm6
	movdqa	xmm0, xmm3

	;64b fold
	; imm 0x1: high qword of xmm3 times low qword of xmm6 (rk5)
	pclmulqdq	xmm3, xmm6, 0x1
	pslldq	xmm0, 8			; original low qword moved into the high qword
	pxor	xmm3, xmm0

	;32b fold
	movdqa	xmm0, xmm3

	pand	xmm0, [mask2]		; keep the low 96 bits, clear the top 32

	psrldq	xmm3, 12		; bring the top 32 bits down to the low dword
	; imm 0x10: low qword of xmm3 times high qword of xmm6 (rk6)
	pclmulqdq	xmm3, xmm6, 0x10
	pxor	xmm3, xmm0

	;barrett reduction
	; The 1..3 byte paths (_only_less_than_4 and friends) jump straight
	; here with the message already positioned in xmm3.
_barrett:
	movdqa	xmm6, [rk7]	; rk7 (low qword) and rk8 (high qword) in xmm6
	movdqa	xmm0, xmm3
	; imm 0x01: high qword of xmm3 times rk7
	pclmulqdq	xmm3, xmm6, 0x01
	pslldq	xmm3, 4
	; imm 0x11: high qword of xmm3 times rk8
	pclmulqdq	xmm3, xmm6, 0x11

	pslldq	xmm3, 4
	pxor	xmm3, xmm0
	pextrd	eax, xmm3,1		; result is taken from dword 1 of xmm3

	; Common exit. _less_than_32 jumps here directly for len == 0 with
	; eax = init_crc << 16, so the shift below hands back init_crc.
_cleanup:
	; scale the result back to 16 bits
	shr	eax, 16
	; restore the registers saved at function entry and release the frame
	movdqa	xmm6, [rsp+16*2]
	movdqa	xmm7, [rsp+16*3]
	add	rsp,16*4+8
	ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Buffers shorter than 128 bytes (signed compare at function entry).
; With at least 32 bytes the first 16 are loaded here and the rest is
; handed to _16B_reduction_loop; shorter inputs go to _less_than_32.
align 16
_less_than_128:

	; check if there is enough buffer to be able to fold 16B at a time
	cmp	arg3, 32
	jl	_less_than_32
	movdqa xmm7, [SHUF_MASK]	; xmm7 = byte-reversal control for pshufb

	; if there is, load the constants
	movdqa	xmm6, [rk1]	; rk1 (low qword) and rk2 (high qword) in xmm6

	movd	xmm0, arg1_low32	; get the initial crc value
	pslldq	xmm0, 12	; align it to its correct place
	movdqu	xmm3, [arg2]	; load the plaintext
	pshufb	xmm3, xmm7	; byte-reflect the plaintext
	pxor	xmm3, xmm0	; xor the initial crc value


	; update the buffer pointer
	add	arg2, 16

	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
	; (the loop branches on the sign of arg3 after its own "sub arg3, 16")
	sub	arg3, 32

	jmp	_16B_reduction_loop


; Buffers shorter than 32 bytes. Dispatches on len:
;   len == 0        -> _cleanup (result is the initial crc)
;   len <  16       -> _less_than_16_left
;   len == 16       -> _exact_16_left
;   16 < len < 32   -> load the first 16 bytes, then _get_last_two_xmms
align 16
_less_than_32:
	; mov initial crc to the return value. this is necessary for zero-length buffers.
	; (arg1_low32 already holds init_crc << 16; _cleanup shifts it back)
	mov	eax, arg1_low32
	test	arg3, arg3
	je	_cleanup

	movdqa xmm7, [SHUF_MASK]	; xmm7 = byte-reversal control for pshufb

	movd	xmm0, arg1_low32	; get the initial crc value
	pslldq	xmm0, 12		; align it to its correct place

	; one compare serves both branches below
	cmp	arg3, 16
	je	_exact_16_left
	jl	_less_than_16_left

	movdqu	xmm3, [arg2]	; load the plaintext
	pshufb	xmm3, xmm7	; byte-reflect the plaintext
	pxor	xmm3, xmm0	; xor the initial crc value
	add	arg2, 16	; arg2 = buf + 16
	sub	arg3, 16	; arg3 = number of trailing bytes (1..15)
	movdqa	xmm6, [rk1]	; rk1 (low qword) and rk2 (high qword) in xmm6
	jmp	_get_last_two_xmms


; 1..15 input bytes. The bytes are copied into a zeroed 16-byte scratch
; slot at [rsp] so that a full 16-byte load never reads past the input.
; On entry: xmm0 = initial crc in the top dword, xmm7 = SHUF_MASK.
; Lengths 1..3 branch to _only_less_than_4; lengths 4..15 are handled
; here and finish at _128_done.
align 16
_less_than_16_left:
	; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.

	pxor	xmm1, xmm1
	mov	r11, rsp		; r11 = write cursor into the scratch slot
	movdqa	[r11], xmm1

	cmp	arg3, 4
	jl	_only_less_than_4

	;	backup the counter value
	;	(arg3 is consumed by the copy below; r9 keeps the original length)
	mov	r9, arg3
	cmp	arg3, 8
	jl	_less_than_8_left

	; load 8 Bytes
	mov	rax, [arg2]
	mov	[r11], rax
	add	r11, 8
	sub	arg3, 8
	add	arg2, 8
_less_than_8_left:

	cmp	arg3, 4
	jl	_less_than_4_left

	; load 4 Bytes
	mov	eax, [arg2]
	mov	[r11], eax
	add	r11, 4
	sub	arg3, 4
	add	arg2, 4
_less_than_4_left:

	cmp	arg3, 2
	jl	_less_than_2_left

	; load 2 Bytes
	mov	ax, [arg2]
	mov	[r11], ax
	add	r11, 2
	sub	arg3, 2
	add	arg2, 2
_less_than_2_left:
	cmp	arg3, 1
	jl	_zero_left

	; load 1 Byte
	mov	al, [arg2]
	mov	[r11], al
_zero_left:
	; scratch slot now holds the input followed by zero padding
	movdqa	xmm3, [rsp]
	pshufb	xmm3, xmm7	; byte-reflect
	pxor	xmm3, xmm0	; xor the initial crc value

	; (leftover, disabled instruction) shl r9, 4
	; build the pshufb control from the table entry at
	; pshufb_shf_table + 16 - r9, with bit 7 of every byte flipped by mask1
	lea	rax, [pshufb_shf_table + 16]
	sub	rax, r9
	movdqu	xmm0, [rax]
	pxor	xmm0, [mask1]

	pshufb	xmm3, xmm0
	jmp	_128_done

; Exactly 16 input bytes: a single block, no folding needed.
; On entry: xmm0 = initial crc in the top dword, xmm7 = SHUF_MASK.
align 16
_exact_16_left:
	movdqu	xmm3, [arg2]	; load the 16 input bytes
	pshufb	xmm3, xmm7	; byte-reflect
	pxor	xmm3, xmm0	; xor the initial crc value

	jmp	_128_done

; 1, 2 or 3 input bytes. Entered from _less_than_16_left with
; r11 = rsp pointing at the zeroed 16-byte scratch slot, xmm0 = initial
; crc in the top dword and xmm7 = SHUF_MASK. The bytes are copied one at
; a time into the scratch slot, byte-reflected, XORed with the crc and
; shifted right by 5, 6 or 7 bytes (for 3, 2 or 1 input bytes) before
; jumping directly to _barrett, bypassing the folds at _128_done.
_only_less_than_4:
	cmp	arg3, 3
	jl	_only_less_than_3

	; load 3 Bytes
	mov	al, [arg2]
	mov	[r11], al

	mov	al, [arg2+1]
	mov	[r11+1], al

	mov	al, [arg2+2]
	mov	[r11+2], al

	movdqa	xmm3, [rsp]
	pshufb	xmm3, xmm7	; byte-reflect
	pxor	xmm3, xmm0	; xor the initial crc value

	psrldq	xmm3, 5		; position the 3-byte message for _barrett

	jmp	_barrett
_only_less_than_3:
	cmp	arg3, 2
	jl	_only_less_than_2

	; load 2 Bytes
	mov	al, [arg2]
	mov	[r11], al

	mov	al, [arg2+1]
	mov	[r11+1], al

	movdqa	xmm3, [rsp]
	pshufb	xmm3, xmm7	; byte-reflect
	pxor	xmm3, xmm0	; xor the initial crc value

	psrldq	xmm3, 6		; position the 2-byte message for _barrett

	jmp	_barrett
_only_less_than_2:

	; load 1 Byte
	; (len == 0 was already diverted to _cleanup in _less_than_32)
	mov	al, [arg2]
	mov	[r11], al

	movdqa	xmm3, [rsp]
	pshufb	xmm3, xmm7	; byte-reflect
	pxor	xmm3, xmm0	; xor the initial crc value

	psrldq	xmm3, 7		; position the 1-byte message for _barrett

	jmp	_barrett

section .data

; precomputed constants
; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
;
; Layout: the rk constants are stored as adjacent qword pairs so that one
; 16-byte load at rk1, rk3, rk5 or rk7 brings in (rkN, rkN+1) as the
; (low, high) qwords of an xmm register. With the align 16 below, each
; pair and mask1, mask2, SHUF_MASK and pshufb_shf_table all start on a
; 16-byte boundary, which the movdqa loads and the memory operands of
; pxor/pand in the code rely on.
align 16
; Q = 0x18BB70000
; rk1 = 2^(32*3) mod Q << 32
; rk2 = 2^(32*5) mod Q << 32
; rk3 = 2^(32*15) mod Q << 32
; rk4 = 2^(32*17) mod Q << 32
; rk5 = 2^(32*3) mod Q << 32
; rk6 = 2^(32*2) mod Q << 32
; rk7 = floor(2^64/Q)
; rk8 = Q
; rk1/rk2: 16-byte folds; rk3/rk4: 64-byte fold loop
rk1:
DQ 0x2d56000000000000
rk2:
DQ 0x06df000000000000
rk3:
DQ 0x044c000000000000
rk4:
DQ 0xe658000000000000
; rk5/rk6: final 128-bit -> 64-bit reduction (same value as rk1 for rk5)
rk5:
DQ 0x2d56000000000000
rk6:
DQ 0x1368000000000000
; rk7/rk8: Barrett reduction
rk7:
DQ 0x00000001f65a57f8
rk8:
DQ 0x000000018bb70000
; mask1: XORed into a pshufb control vector to flip bit 7 of every byte
mask1:
dq 0x8080808080808080, 0x8080808080808080
; mask2: keeps the low 96 bits of an xmm register and clears the top 32
mask2:
dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

; SHUF_MASK: pshufb control that reverses the order of all 16 bytes
SHUF_MASK:
dq 0x08090A0B0C0D0E0F, 0x0001020304050607

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; the code reads 16 bytes at (pshufb_shf_table + 16 - n) for a byte count n
; different alignments result in values as shown:
;	dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;	dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;	dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;	dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;	dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;	dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;	dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
;	dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
;	dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
;	dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;	dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;	dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;	dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;	dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;	dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908

; NOTE(review): slversion is not defined in this file; presumably a
; version-stamp macro provided by reg_sizes.asm -- confirm there.
;;;       func             core, ver, snum
slversion crc16_t10dif_by4, 05,   02,  0016
Commit	Line	Data
7c673cae FG	1	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	2	; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
	3	;
	4	; Redistribution and use in source and binary forms, with or without
	5	; modification, are permitted provided that the following conditions
	6	; are met:
	7	; * Redistributions of source code must retain the above copyright
	8	; notice, this list of conditions and the following disclaimer.
	9	; * Redistributions in binary form must reproduce the above copyright
	10	; notice, this list of conditions and the following disclaimer in
	11	; the documentation and/or other materials provided with the
	12	; distribution.
	13	; * Neither the name of Intel Corporation nor the names of its
	14	; contributors may be used to endorse or promote products derived
	15	; from this software without specific prior written permission.
	16	;
	17	; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	18	; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	19	; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	20	; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	21	; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	22	; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	23	; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	24	; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	25	; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	26	; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	27	; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	28	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	29	;
	30	; Function API:
	31	; UINT16 crc16_t10dif_by4(
	32	; UINT16 init_crc, //initial CRC value, 16 bits
	33	; const unsigned char *buf, //buffer pointer to calculate CRC on
	34	; UINT64 len //buffer length in bytes (64-bit data)
	35	; );
	36	;
	37	; Authors:
	38	; Erdinc Ozturk
	39	; Vinodh Gopal
	40	; James Guilford
	41	;
	42	; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
	43	; URL: http://download.intel.com/design/intarch/papers/323102.pdf
	44	;
	45
	46	%include "reg_sizes.asm"
	47
224ce89b WB	48	%define fetch_dist 1024
224ce89b WB	49
7c673cae FG	50	[bits 64]
	51	default rel
	52
	53	section .text
	54	%ifidn __OUTPUT_FORMAT__, win64
	55	%xdefine arg1 rcx
	56	%xdefine arg2 rdx
	57	%xdefine arg3 r8
	58
	59	%xdefine arg1_low32 ecx
	60	%else
	61	%xdefine arg1 rdi
	62	%xdefine arg2 rsi
	63	%xdefine arg3 rdx
	64
	65	%xdefine arg1_low32 edi
	66	%endif
	67
	68	align 16
f91f0fd5	69	global crc16_t10dif_by4:ISAL_SYM_TYPE_FUNCTION
7c673cae FG	70	crc16_t10dif_by4:
	71
	72	; adjust the 16-bit initial_crc value, scale it to 32 bits
	73	shl arg1_low32, 16
	74
	75	; After this point, code flow is exactly same as a 32-bit CRC.
	76	; The only difference is before returning eax, we will shift
	77	; it right 16 bits, to scale back to 16 bits.
	78
	79	sub rsp,16*4+8
	80
	81	; push the xmm registers into the stack to maintain
	82	movdqa [rsp+16*2],xmm6
	83	movdqa [rsp+16*3],xmm7
	84
	85	; check if smaller than 128B
	86	cmp arg3, 128
	87
	88	; for sizes less than 128, we can't fold 64B at a time...
	89	jl _less_than_128
	90
	91
	92	; load the initial crc value
	93	movd xmm6, arg1_low32 ; initial crc
	94
	95	; crc value does not need to be byte-reflected, but it needs to
	96	; be moved to the high part of the register.
	97	; because data will be byte-reflected and will align with
	98	; initial crc at correct place.
	99	pslldq xmm6, 12
	100
	101	movdqa xmm7, [SHUF_MASK]
	102	; receive the initial 64B data, xor the initial crc value
	103	movdqu xmm0, [arg2]
	104	movdqu xmm1, [arg2+16]
	105	movdqu xmm2, [arg2+32]
	106	movdqu xmm3, [arg2+48]
	107
	108	pshufb xmm0, xmm7
	109	; XOR the initial_crc value
	110	pxor xmm0, xmm6
	111	pshufb xmm1, xmm7
	112	pshufb xmm2, xmm7
	113	pshufb xmm3, xmm7
	114
	115	movdqa xmm6, [rk3] ;xmm6 has rk3 and rk4
	116	;imm value of pclmulqdq instruction
	117	;will determine which constant to use
	118	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	119	; we subtract 128 instead of 64 to save one instruction from the loop
	120	sub arg3, 128
	121
	122	; at this section of the code, there is 64*x+y (0<=y<64) bytes of
	123	; buffer. The _fold_64_B_loop
	124	; loop will fold 64B at a time until we have 64+y Bytes of buffer
	125
	126
	127	; fold 64B at a time. This section of the code folds 4 xmm
	128	; registers in parallel
	129	_fold_64_B_loop:
	130
	131	; update the buffer pointer
	132	add arg2, 64 ; buf += 64;
	133
224ce89b	134	prefetchnta [arg2+fetch_dist+0]
7c673cae FG	135	movdqu xmm4, xmm0
	136	movdqu xmm5, xmm1
	137
	138	pclmulqdq xmm0, xmm6 , 0x11
	139	pclmulqdq xmm1, xmm6 , 0x11
	140
	141	pclmulqdq xmm4, xmm6, 0x0
	142	pclmulqdq xmm5, xmm6, 0x0
	143
	144	pxor xmm0, xmm4
	145	pxor xmm1, xmm5
	146
224ce89b	147	prefetchnta [arg2+fetch_dist+32]
7c673cae FG	148	movdqu xmm4, xmm2
	149	movdqu xmm5, xmm3
	150
	151	pclmulqdq xmm2, xmm6, 0x11
	152	pclmulqdq xmm3, xmm6, 0x11
	153
	154	pclmulqdq xmm4, xmm6, 0x0
	155	pclmulqdq xmm5, xmm6, 0x0
	156
	157	pxor xmm2, xmm4
	158	pxor xmm3, xmm5
	159
	160	movdqu xmm4, [arg2]
	161	movdqu xmm5, [arg2+16]
	162	pshufb xmm4, xmm7
	163	pshufb xmm5, xmm7
	164	pxor xmm0, xmm4
	165	pxor xmm1, xmm5
	166
	167	movdqu xmm4, [arg2+32]
	168	movdqu xmm5, [arg2+48]
	169	pshufb xmm4, xmm7
	170	pshufb xmm5, xmm7
	171
	172	pxor xmm2, xmm4
	173	pxor xmm3, xmm5
	174
	175	sub arg3, 64
	176
	177	; check if there is another 64B in the buffer to be able to fold
	178	jge _fold_64_B_loop
	179	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	180
	181
	182	add arg2, 64
	183	; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
	184	; the 64B of folded data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3
	185
	186
	187	; fold the 4 xmm registers to 1 xmm register with different constants
	188
	189	movdqa xmm6, [rk1] ;xmm6 has rk1 and rk2
	190	;imm value of pclmulqdq instruction will
	191	;determine which constant to use
	192
	193	movdqa xmm4, xmm0
	194	pclmulqdq xmm0, xmm6, 0x11
	195	pclmulqdq xmm4, xmm6, 0x0
	196	pxor xmm1, xmm4
	197	pxor xmm1, xmm0
	198
	199	movdqa xmm4, xmm1
	200	pclmulqdq xmm1, xmm6, 0x11
	201	pclmulqdq xmm4, xmm6, 0x0
	202	pxor xmm2, xmm4
	203	pxor xmm2, xmm1
	204
	205	movdqa xmm4, xmm2
	206	pclmulqdq xmm2, xmm6, 0x11
	207	pclmulqdq xmm4, xmm6, 0x0
	208	pxor xmm3, xmm4
	209	pxor xmm3, xmm2
	210
	211
212	; instead of 64, we add 48 to the loop counter to save 1 instruction from the loop
213	; instead of a cmp instruction, we use the negative flag with the jl instruction
214	add arg3, 64-16
215	jl _final_reduction_for_128
216
217	; now we have 16+y bytes left to reduce. 16 Bytes
218	; is in register xmm3 and the rest is in memory
219	; we can fold 16 bytes at a time if y>=16
220	; continue folding 16B at a time
221
222	_16B_reduction_loop:
223	movdqa xmm4, xmm3
224	pclmulqdq xmm3, xmm6, 0x11
225	pclmulqdq xmm4, xmm6, 0x0
226	pxor xmm3, xmm4
227	movdqu xmm0, [arg2]
228	pshufb xmm0, xmm7
229	pxor xmm3, xmm0
230	add arg2, 16
231	sub arg3, 16
232	; instead of a cmp instruction, we utilize the flags with the jge instruction
233	; equivalent of: cmp arg3, 16-16
234	; check if there is any more 16B in the buffer to be able to fold
235	jge _16B_reduction_loop
236
237	;now we have 16+z bytes left to reduce, where 0<= z < 16.
238	;first, we reduce the data in the xmm3 register
239
240
241	_final_reduction_for_128:
242	; check if any more data to fold. If not, compute the CRC of the final 128 bits
243	add arg3, 16
244	je _128_done
245
246	; here we are getting data that is less than 16 bytes.
247	; since we know that there was data before the pointer,
248	; we can offset the input pointer before the actual point,
249	; to receive exactly 16 bytes.
250	; after that the registers need to be adjusted.
251	_get_last_two_xmms:
252	movdqa xmm2, xmm3
253
254	movdqu xmm1, [arg2 - 16 + arg3]
255	pshufb xmm1, xmm7
256
257	; get rid of the extra data that was loaded before
258	; load the shift constant
259	lea rax, [pshufb_shf_table + 16]
260	sub rax, arg3
261	movdqu xmm0, [rax]
262
263	; shift xmm2 to the left by arg3 bytes
264	pshufb xmm2, xmm0
265
266	; shift xmm3 to the right by 16-arg3 bytes
267	pxor xmm0, [mask1]
268	pshufb xmm3, xmm0
269	pblendvb xmm1, xmm2 ;xmm0 is implicit
270
271	; fold 16 Bytes
272	movdqa xmm2, xmm1
273	movdqa xmm4, xmm3
274	pclmulqdq xmm3, xmm6, 0x11
275	pclmulqdq xmm4, xmm6, 0x0
276	pxor xmm3, xmm4
277	pxor xmm3, xmm2
278
279	_128_done:
280	; compute crc of a 128-bit value
281	movdqa xmm6, [rk5] ; rk5 and rk6 in xmm6
282	movdqa xmm0, xmm3
283
284	;64b fold
285	pclmulqdq xmm3, xmm6, 0x1
286	pslldq xmm0, 8
287	pxor xmm3, xmm0
288
289	;32b fold
290	movdqa xmm0, xmm3
291
292	pand xmm0, [mask2]
293
294	psrldq xmm3, 12
295	pclmulqdq xmm3, xmm6, 0x10
296	pxor xmm3, xmm0
297
298	;barrett reduction
299	_barrett:
300	movdqa xmm6, [rk7] ; rk7 and rk8 in xmm6
301	movdqa xmm0, xmm3
302	pclmulqdq xmm3, xmm6, 0x01
303	pslldq xmm3, 4
304	pclmulqdq xmm3, xmm6, 0x11
305
306	pslldq xmm3, 4
307	pxor xmm3, xmm0
308	pextrd eax, xmm3,1
309
310	_cleanup:
311	; scale the result back to 16 bits
312	shr eax, 16
313	movdqa xmm6, [rsp+16*2]
314	movdqa xmm7, [rsp+16*3]
315	add rsp,16*4+8
316	ret
317
318
319	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
320	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
321	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
322	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
323
324	align 16
325	_less_than_128:
326
327	; check if there is enough buffer to be able to fold 16B at a time
328	cmp arg3, 32
329	jl _less_than_32
330	movdqa xmm7, [SHUF_MASK]
331
332	; if there is, load the constants
333	movdqa xmm6, [rk1] ; rk1 and rk2 in xmm6
334
335	movd xmm0, arg1_low32 ; get the initial crc value
336	pslldq xmm0, 12 ; align it to its correct place
337	movdqu xmm3, [arg2] ; load the plaintext
338	pshufb xmm3, xmm7 ; byte-reflect the plaintext
339	pxor xmm3, xmm0
340
341
342	; update the buffer pointer
343	add arg2, 16
344
345	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
346	sub arg3, 32
347
348	jmp _16B_reduction_loop
349
350
351	align 16
352	_less_than_32:
353	; mov initial crc to the return value. this is necessary for zero-length buffers.
354	mov eax, arg1_low32
355	test arg3, arg3
356	je _cleanup
357
358	movdqa xmm7, [SHUF_MASK]
359
360	movd xmm0, arg1_low32 ; get the initial crc value
361	pslldq xmm0, 12 ; align it to its correct place
362
363	cmp arg3, 16
364	je _exact_16_left
365	jl _less_than_16_left
366
367	movdqu xmm3, [arg2] ; load the plaintext
368	pshufb xmm3, xmm7 ; byte-reflect the plaintext
369	pxor xmm3, xmm0 ; xor the initial crc value
370	add arg2, 16
371	sub arg3, 16
372	movdqa xmm6, [rk1] ; rk1 and rk2 in xmm6
373	jmp _get_last_two_xmms
374
375
376	align 16
377	_less_than_16_left:
378	; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
379
380	pxor xmm1, xmm1
381	mov r11, rsp
382	movdqa [r11], xmm1
383
384	cmp arg3, 4
385	jl _only_less_than_4
386
387	; backup the counter value
388	mov r9, arg3
389	cmp arg3, 8
390	jl _less_than_8_left
391
392	; load 8 Bytes
393	mov rax, [arg2]
394	mov [r11], rax
395	add r11, 8
396	sub arg3, 8
397	add arg2, 8
398	_less_than_8_left:
399
400	cmp arg3, 4
401	jl _less_than_4_left
402
403	; load 4 Bytes
404	mov eax, [arg2]
405	mov [r11], eax
406	add r11, 4
407	sub arg3, 4
408	add arg2, 4
409	_less_than_4_left:
410
411	cmp arg3, 2
412	jl _less_than_2_left
413
414	; load 2 Bytes
415	mov ax, [arg2]
416	mov [r11], ax
417	add r11, 2
418	sub arg3, 2
419	add arg2, 2
420	_less_than_2_left:
421	cmp arg3, 1
422	jl _zero_left
423
424	; load 1 Byte
425	mov al, [arg2]
426	mov [r11], al
427	_zero_left:
428	movdqa xmm3, [rsp]
429	pshufb xmm3, xmm7
430	pxor xmm3, xmm0 ; xor the initial crc value
431
432	; shl r9, 4
433	lea rax, [pshufb_shf_table + 16]
434	sub rax, r9
435	movdqu xmm0, [rax]
436	pxor xmm0, [mask1]
437
438	pshufb xmm3, xmm0
439	jmp _128_done
440
441	align 16
442	_exact_16_left:
443	movdqu xmm3, [arg2]
444	pshufb xmm3, xmm7
445	pxor xmm3, xmm0 ; xor the initial crc value
446
447	jmp _128_done
448
449	_only_less_than_4:
450	cmp arg3, 3
451	jl _only_less_than_3
452
453	; load 3 Bytes
454	mov al, [arg2]
455	mov [r11], al
456
457	mov al, [arg2+1]
458	mov [r11+1], al
459
460	mov al, [arg2+2]
461	mov [r11+2], al
462
463	movdqa xmm3, [rsp]
464	pshufb xmm3, xmm7
465	pxor xmm3, xmm0 ; xor the initial crc value
466
467	psrldq xmm3, 5
468
469	jmp _barrett
470	_only_less_than_3:
471	cmp arg3, 2
472	jl _only_less_than_2
473
474	; load 2 Bytes
475	mov al, [arg2]
476	mov [r11], al
477
478	mov al, [arg2+1]
479	mov [r11+1], al
480
481	movdqa xmm3, [rsp]
482	pshufb xmm3, xmm7
483	pxor xmm3, xmm0 ; xor the initial crc value
484
485	psrldq xmm3, 6
486
487	jmp _barrett
488	_only_less_than_2:
489
490	; load 1 Byte
491	mov al, [arg2]
492	mov [r11], al
493
494	movdqa xmm3, [rsp]
495	pshufb xmm3, xmm7
496	pxor xmm3, xmm0 ; xor the initial crc value
497
498	psrldq xmm3, 7
499
500	jmp _barrett
501
502	section .data
503
504	; precomputed constants
505	; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
506	align 16
507	; Q = 0x18BB70000
508	; rk1 = 2^(32*3) mod Q << 32
509	; rk2 = 2^(32*5) mod Q << 32
510	; rk3 = 2^(32*15) mod Q << 32
511	; rk4 = 2^(32*17) mod Q << 32
512	; rk5 = 2^(32*3) mod Q << 32
513	; rk6 = 2^(32*2) mod Q << 32
514	; rk7 = floor(2^64/Q)
515	; rk8 = Q
516	rk1:
517	DQ 0x2d56000000000000
518	rk2:
519	DQ 0x06df000000000000
520	rk3:
521	DQ 0x044c000000000000
522	rk4:
523	DQ 0xe658000000000000
524	rk5:
525	DQ 0x2d56000000000000
526	rk6:
527	DQ 0x1368000000000000
528	rk7:
529	DQ 0x00000001f65a57f8
530	rk8:
531	DQ 0x000000018bb70000
532	mask1:
533	dq 0x8080808080808080, 0x8080808080808080
534	mask2:
535	dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
536
537	SHUF_MASK:
538	dq 0x08090A0B0C0D0E0F, 0x0001020304050607
539
540	pshufb_shf_table:
541	; use these values for shift constants for the pshufb instruction
542	; different alignments result in values as shown:
543	; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
544	; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2
545	; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3
546	; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
547	; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
548	; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
549	; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
550	; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
551	; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
552	; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
553	; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
554	; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
555	; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
556	; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
557	; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
558	dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
559	dq 0x0706050403020100, 0x000e0d0c0b0a0908
560
561	;;; func core, ver, snum
562	slversion crc16_t10dif_by4, 05, 02, 0016