; [ceph.git] / ceph / src / isa-l / crc / crc16_t10dif_by4.asm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
;       Function API:
;       UINT16 crc16_t10dif_by4(
;               UINT16 init_crc, //initial CRC value, 16 bits
;               const unsigned char *buf, //buffer pointer to calculate CRC on
;               UINT64 len //buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://download.intel.com/design/intarch/papers/323102.pdf
;

%include "reg_sizes.asm"

%define	fetch_dist	1024

[bits 64]
default rel

section .text
; Argument registers, selected by the assembler output format.
;   arg1       = init_crc (see "Function API" in the file header)
;   arg2       = buf
;   arg3       = len
;   arg1_low32 = 32-bit view of arg1
%ifidn __OUTPUT_FORMAT__, win64
	; win64 output: first three integer arguments arrive in rcx, rdx, r8
	; (Microsoft x64 calling convention)
	%xdefine        arg1 rcx
	%xdefine        arg2 rdx
	%xdefine        arg3 r8

	%xdefine        arg1_low32 ecx
%else
	; every other output format: rdi, rsi, rdx
	; (System V AMD64 calling convention)
	%xdefine        arg1 rdi
	%xdefine        arg2 rsi
	%xdefine        arg3 rdx

	%xdefine        arg1_low32 edi
%endif

;-----------------------------------------------------------------------
; UINT16 crc16_t10dif_by4(UINT16 init_crc, const unsigned char *buf, UINT64 len)
;
; In:     arg1 = init_crc, arg2 = buf, arg3 = len
; Out:    eax  = CRC, shifted back down to 16 bits at _cleanup
; Stack:  16*4+8 bytes reserved. xmm6/xmm7 are spilled to [rsp+16*2] and
;         [rsp+16*3] and reloaded at _cleanup. [rsp] doubles as a 16-byte
;         staging buffer on the len < 16 paths.
; Writes: rax, r9, r11, arg1..arg3, xmm0-xmm5, flags
;         (xmm6/xmm7 are used but restored before ret)
;-----------------------------------------------------------------------
align 16
; mk_global / endbranch are macros that are not defined in this file
; (presumably they come from reg_sizes.asm -- confirm).
mk_global 	crc16_t10dif_by4, function
crc16_t10dif_by4:
	endbranch

	; adjust the 16-bit initial_crc value, scale it to 32 bits
	shl	arg1_low32, 16

	; After this point, code flow is exactly same as a 32-bit CRC.
	; The only difference is before returning eax, we will shift
	; it right 16 bits, to scale back to 16 bits.

	; NOTE(review): the extra 8 bytes presumably bring rsp back to a
	; 16-byte boundary (rsp is 8 mod 16 at function entry), which the
	; movdqa accesses to the stack require -- confirm for every ABI
	; this file is built for.
	sub	rsp,16*4+8

	; Spill xmm6/xmm7 so they can be restored before returning. They are
	; callee-saved under the Microsoft x64 convention; the spill is done
	; unconditionally for every output format.
	movdqa [rsp+16*2],xmm6
	movdqa [rsp+16*3],xmm7

	; check if smaller than 128B
	cmp	arg3, 128

	; for sizes less than 128, we can't fold 64B at a time...
	; NOTE(review): jl is a signed comparison, so a len >= 2^63 would be
	; treated as negative and take the short-buffer path.
	jl	_less_than_128


	; load the initial crc value
	movd	xmm6, arg1_low32	; initial crc

	; crc value does not need to be byte-reflected, but it needs to
	; be moved to the high part of the register.
	; because data will be byte-reflected and will align with
	; initial crc at correct place.
	pslldq	xmm6, 12

	; xmm7 = byte-reflect shuffle mask; it stays in xmm7 for the rest of
	; the routine
	movdqa xmm7, [SHUF_MASK]
	; receive the initial 64B data, xor the initial crc value
	movdqu	xmm0, [arg2]
	movdqu	xmm1, [arg2+16]
	movdqu	xmm2, [arg2+32]
	movdqu	xmm3, [arg2+48]

	pshufb	xmm0, xmm7
	; XOR the initial_crc value (only into the first 16 bytes of the buffer)
	pxor	xmm0, xmm6
	pshufb	xmm1, xmm7
	pshufb	xmm2, xmm7
	pshufb	xmm3, xmm7

	; xmm6 is free again now that the initial crc has been mixed in
	movdqa	xmm6, [rk3]	;xmm6 = {rk3 (low qword), rk4 (high qword)}
					;imm value of pclmulqdq instruction
					;will determine which constant to use
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	; we subtract 128 instead of 64 to save one instruction from the loop
	; (64 bytes are already loaded, so arg3 = unread bytes - 64 from here on)
	sub	arg3, 128

	; at this section of the code, there is 64*x+y (0<=y<64) bytes of
	; buffer. The _fold_64_B_loop
	; loop will fold 64B at a time until we have 64+y Bytes of buffer


	; fold 64B at a time. This section of the code folds 4 xmm
	; registers in parallel
_fold_64_B_loop:
	; On entry to each iteration:
	;   xmm0..xmm3 = 64 bytes of running (folded) data
	;   xmm6       = {rk3 (low qword), rk4 (high qword)}
	;   xmm7       = byte-reflect shuffle mask
	;   arg3       = unread bytes - 64, and is >= 0
	; pclmulqdq imm 0x11 multiplies the two high qwords, imm 0x0 the two
	; low qwords; the two products are XORed together and then the next
	; 64 bytes of byte-reflected input are XORed in.

	; update the buffer pointer
	add	arg2, 64		;    buf += 64;

	prefetchnta [arg2+fetch_dist+0]
	; xmm4/xmm5 = copies of xmm0/xmm1 for the low-qword products
	movdqu	xmm4, xmm0
	movdqu	xmm5, xmm1

	pclmulqdq	xmm0, xmm6 , 0x11
	pclmulqdq	xmm1, xmm6 , 0x11

	pclmulqdq	xmm4, xmm6, 0x0
	pclmulqdq	xmm5, xmm6, 0x0

	pxor	xmm0, xmm4
   	pxor	xmm1, xmm5

	prefetchnta [arg2+fetch_dist+32]
	; same again for xmm2/xmm3
	movdqu	xmm4, xmm2
	movdqu	xmm5, xmm3

	pclmulqdq	xmm2, xmm6, 0x11
	pclmulqdq	xmm3, xmm6, 0x11

	pclmulqdq	xmm4, xmm6, 0x0
	pclmulqdq	xmm5, xmm6, 0x0

	pxor	xmm2, xmm4
	pxor	xmm3, xmm5

	; load the next 64 bytes, byte-reflect them and XOR them into the
	; folded accumulators
	movdqu	xmm4, [arg2]
	movdqu	xmm5, [arg2+16]
	pshufb	xmm4, xmm7
	pshufb	xmm5, xmm7
	pxor	xmm0, xmm4
	pxor	xmm1, xmm5

	movdqu	xmm4, [arg2+32]
	movdqu	xmm5, [arg2+48]
	pshufb	xmm4, xmm7
	pshufb	xmm5, xmm7

	pxor	xmm2, xmm4
	pxor	xmm3, xmm5

	sub	arg3, 64

	; check if there is another 64B in the buffer to be able to fold
	; (flags come from the sub above; signed test)
	jge	_fold_64_B_loop
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


	; step over the 64 bytes that the last iteration loaded
	add	arg2, 64
	; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
	; (y = arg3 + 64, 0 <= y < 64)
	; the 64B of folded data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3


	; fold the 4 xmm registers to 1 xmm register with different constants

	movdqa	xmm6, [rk1]	;xmm6 = {rk1 (low qword), rk2 (high qword)}
					;imm value of pclmulqdq instruction will
					;determine which constant to use

	; fold xmm0 into xmm1
	movdqa	xmm4, xmm0
	pclmulqdq	xmm0, xmm6, 0x11
	pclmulqdq	xmm4, xmm6, 0x0
	pxor	xmm1, xmm4
	pxor	xmm1, xmm0

	; fold xmm1 into xmm2
	movdqa	xmm4, xmm1
	pclmulqdq	xmm1, xmm6, 0x11
	pclmulqdq	xmm4, xmm6, 0x0
	pxor	xmm2, xmm4
	pxor	xmm2, xmm1

	; fold xmm2 into xmm3; xmm3 now carries the whole folded state
	movdqa	xmm4, xmm2
	pclmulqdq	xmm2, xmm6, 0x11
	pclmulqdq	xmm4, xmm6, 0x0
	pxor	xmm3, xmm4
	pxor	xmm3, xmm2


	; instead of 64, we add 48 to the loop counter to save 1 instruction from the loop
	; instead of a cmp instruction, we use the negative flag with the jl instruction
	; (after the add, arg3 = unread bytes - 16; negative means fewer than
	; 16 unread bytes remain)
	add	arg3, 64-16
	jl	_final_reduction_for_128

	; now we have 16+y bytes left to reduce. 16 Bytes
	; is in register xmm3 and the rest is in memory
	; we can fold 16 bytes at a time if y>=16
	; continue folding 16B at a time

_16B_reduction_loop:
	; On entry to each iteration:
	;   xmm3 = 16 bytes of folded data
	;   xmm6 = {rk1 (low qword), rk2 (high qword)}
	;   xmm7 = byte-reflect shuffle mask
	;   arg3 = unread bytes - 16, and is >= 0
	; Also entered by a jmp from _less_than_128.
	movdqa	xmm4, xmm3
	pclmulqdq	xmm3, xmm6, 0x11	; high qword * high qword of xmm6
	pclmulqdq	xmm4, xmm6, 0x0		; low qword * low qword of xmm6
	pxor	xmm3, xmm4
	; XOR in the next 16 byte-reflected input bytes
	movdqu	xmm0, [arg2]
	pshufb	xmm0, xmm7
	pxor	xmm3, xmm0
	add	arg2, 16
	sub	arg3, 16
	; instead of a cmp instruction, we utilize the flags with the jge instruction
	; equivalent of: cmp arg3, 16-16
	; check if there is any more 16B in the buffer to be able to fold
	jge	_16B_reduction_loop

	;now we have 16+z bytes left to reduce, where 0<= z < 16.
	;first, we reduce the data in the xmm3 register


_final_reduction_for_128:
	; check if any more data to fold. If not, compute the CRC of the final 128 bits
	; (arg3 was unread bytes - 16; after the add it is the unread byte
	; count itself, 0..15)
	add	arg3, 16
	je	_128_done

	; here we are getting data that is less than 16 bytes.
	; since we know that there was data before the pointer,
	; we can offset the input pointer before the actual point,
	; to receive exactly 16 bytes.
	; after that the registers need to be adjusted.
	;
	; Also entered by a jmp from _less_than_32 (16 < len < 32) with
	; xmm3 = first 16 bytes (crc already XORed in), xmm6 = {rk1, rk2},
	; arg2 advanced past those 16 bytes and arg3 = len - 16.
_get_last_two_xmms:
	movdqa	xmm2, xmm3

	; load the final 16 bytes of the buffer (they end at arg2 + arg3)
	movdqu	xmm1, [arg2 - 16 + arg3]
	pshufb	xmm1, xmm7

	; get rid of the extra data that was loaded before
	; load the shift constant
	; (16 bytes read at pshufb_shf_table + 16 - arg3)
	lea	rax, [pshufb_shf_table + 16]
	sub	rax, arg3
	movdqu	xmm0, [rax]

	; shift xmm2 to the left by arg3 bytes
	pshufb	xmm2, xmm0

	; shift xmm3 to the right by 16-arg3 bytes
	; (XOR with mask1 flips bit 7 of every shuffle-control byte, turning
	; the left-shift control into the complementary right-shift control)
	pxor	xmm0, [mask1]
	pshufb	xmm3, xmm0
	; merge: take the byte from xmm2 wherever bit 7 of the matching xmm0
	; byte is set, otherwise keep the xmm1 byte
	pblendvb	xmm1, xmm2	;xmm0 is implicit

	; fold 16 Bytes
	movdqa	xmm2, xmm1
	movdqa	xmm4, xmm3
	pclmulqdq	xmm3, xmm6, 0x11
	pclmulqdq	xmm4, xmm6, 0x0
	pxor	xmm3, xmm4
	pxor	xmm3, xmm2

_128_done:
	; compute crc of a 128-bit value
	; Input: xmm3 = 128 bits still to be reduced.
	movdqa	xmm6, [rk5]	; xmm6 = {rk5 (low qword), rk6 (high qword)}
	movdqa	xmm0, xmm3

	;64b fold
	; imm 0x1: high qword of xmm3 times low qword of xmm6 (rk5)
	pclmulqdq	xmm3, xmm6, 0x1
	pslldq	xmm0, 8
	pxor	xmm3, xmm0

	;32b fold
	movdqa	xmm0, xmm3

	; mask2 keeps the low 12 bytes and clears the top 4 bytes
	pand	xmm0, [mask2]

	psrldq	xmm3, 12
	; imm 0x10: low qword of xmm3 times high qword of xmm6 (rk6)
	pclmulqdq	xmm3, xmm6, 0x10
	pxor	xmm3, xmm0

	;barrett reduction
	; Also entered directly by the 1-, 2- and 3-byte paths, which skip
	; the two folds above.
_barrett:
	movdqa	xmm6, [rk7]	; xmm6 = {rk7 (low qword), rk8 (high qword)}
	movdqa	xmm0, xmm3
	; imm 0x01: high qword of xmm3 times low qword of xmm6 (rk7)
	pclmulqdq	xmm3, xmm6, 0x01
	pslldq	xmm3, 4
	; imm 0x11: high qword of xmm3 times high qword of xmm6 (rk8)
	pclmulqdq	xmm3, xmm6, 0x11

	pslldq	xmm3, 4
	pxor	xmm3, xmm0
	; the 32-bit result is dword 1 of xmm3
	pextrd	eax, xmm3,1

	; Common exit. eax holds the CRC scaled to 32 bits; for len == 0 it
	; holds the scaled initial crc set in _less_than_32.
_cleanup:
	; scale the result back to 16 bits
	shr	eax, 16
	; restore the registers spilled at function entry and release the frame
	movdqa	xmm6, [rsp+16*2]
	movdqa	xmm7, [rsp+16*3]
	add	rsp,16*4+8
	ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
; Reached from the entry point when len < 128 (signed compare).
; For 32 <= len < 128 this seeds xmm3 with the first 16 bytes and joins
; the 16-byte fold loop; anything shorter goes to _less_than_32.
_less_than_128:

	; check if there is enough buffer to be able to fold 16B at a time
	cmp	arg3, 32
	jl	_less_than_32
	movdqa xmm7, [SHUF_MASK]	; xmm7 = byte-reflect shuffle mask

	; if there is, load the constants
	movdqa	xmm6, [rk1]	; rk1 and rk2 in xmm6

	movd	xmm0, arg1_low32	; get the initial crc value
	pslldq	xmm0, 12	; align it to its correct place
	movdqu	xmm3, [arg2]	; load the plaintext
	pshufb	xmm3, xmm7	; byte-reflect the plaintext
	pxor	xmm3, xmm0	; xor the initial crc value


	; update the buffer pointer
	add	arg2, 16

	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
	; (arg3 = unread bytes - 16, which is what _16B_reduction_loop expects)
	sub	arg3, 32

	jmp	_16B_reduction_loop


align 16
; len < 32: dispatch on len == 0, len < 16, len == 16, or 16 < len < 32.
_less_than_32:
	; mov initial crc to the return value. this is necessary for zero-length buffers.
	; (arg1_low32 is still scaled to 32 bits; _cleanup shifts it back down)
	mov	eax, arg1_low32
	test	arg3, arg3
	je	_cleanup

	movdqa xmm7, [SHUF_MASK]	; xmm7 = byte-reflect shuffle mask

	movd	xmm0, arg1_low32	; get the initial crc value
	pslldq	xmm0, 12		; align it to its correct place

	; both branches below use the flags of this single cmp
	cmp	arg3, 16
	je	_exact_16_left
	jl	_less_than_16_left

	; 16 < len < 32: take the first 16 bytes into xmm3, then let
	; _get_last_two_xmms handle the remaining len - 16 (1..15) bytes
	movdqu	xmm3, [arg2]	; load the plaintext
	pshufb	xmm3, xmm7	; byte-reflect the plaintext
	pxor	xmm3, xmm0	; xor the initial crc value
	add	arg2, 16
	sub	arg3, 16
	movdqa	xmm6, [rk1]	; rk1 and rk2 in xmm6
	jmp	_get_last_two_xmms


align 16
; 0 < len < 16. On entry xmm0 = initial crc in the top dword, xmm7 =
; byte-reflect mask. The input is copied piecewise (8/4/2/1 bytes) into a
; zeroed 16-byte buffer at [rsp], so no byte past buf+len is read.
_less_than_16_left:
	; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.

	pxor	xmm1, xmm1
	mov	r11, rsp	; r11 = write cursor into the staging buffer
	movdqa	[r11], xmm1

	; 1..3 bytes are handled separately (they bypass the 128-bit folds)
	cmp	arg3, 4
	jl	_only_less_than_4

	;	backup the counter value
	;	(r9 keeps the original len; arg3 is consumed by the copy below)
	mov	r9, arg3
	cmp	arg3, 8
	jl	_less_than_8_left

	; load 8 Bytes
	mov	rax, [arg2]
	mov	[r11], rax
	add	r11, 8
	sub	arg3, 8
	add	arg2, 8
_less_than_8_left:

	cmp	arg3, 4
	jl	_less_than_4_left

	; load 4 Bytes
	mov	eax, [arg2]
	mov	[r11], eax
	add	r11, 4
	sub	arg3, 4
	add	arg2, 4
_less_than_4_left:

	cmp	arg3, 2
	jl	_less_than_2_left

	; load 2 Bytes
	mov	ax, [arg2]
	mov	[r11], ax
	add	r11, 2
	sub	arg3, 2
	add	arg2, 2
_less_than_2_left:
	cmp	arg3, 1
	jl	_zero_left

	; load 1 Byte
	mov	al, [arg2]
	mov	[r11], al
_zero_left:
	; the staging buffer now holds the len input bytes followed by zeros
	movdqa	xmm3, [rsp]
	pshufb	xmm3, xmm7	; byte-reflect the plaintext
	pxor	xmm3, xmm0	; xor the initial crc value

	; shl r9, 4
	; (leftover disabled instruction; the table is indexed in bytes, not
	; in 16-byte rows)
	lea	rax, [pshufb_shf_table + 16]
	sub	rax, r9
	movdqu	xmm0, [rax]
	; flip bit 7 of every control byte: this is the same control that
	; _get_last_two_xmms uses to shift right by 16-len bytes
	pxor	xmm0, [mask1]

	pshufb	xmm3, xmm0
	jmp	_128_done

align 16
; len == 16: a single 16-byte block, no folding with further input needed.
; On entry xmm0 = initial crc in the top dword, xmm7 = byte-reflect mask.
_exact_16_left:
	movdqu	xmm3, [arg2]	; load the plaintext
	pshufb	xmm3, xmm7	; byte-reflect the plaintext
	pxor	xmm3, xmm0	; xor the initial crc value

	jmp	_128_done

; len < 4 (1, 2 or 3 bytes for any non-negative len; 0 was handled in
; _less_than_32). On entry: [rsp] is a zeroed 16-byte buffer, r11 = rsp,
; xmm0 = initial crc in the top dword, xmm7 = byte-reflect mask.
; Each case copies the bytes one at a time, applies a fixed psrldq and
; jumps straight to _barrett, skipping the 64b/32b folds in _128_done.
_only_less_than_4:
	cmp	arg3, 3
	jl	_only_less_than_3

	; load 3 Bytes
	mov	al, [arg2]
	mov	[r11], al

	mov	al, [arg2+1]
	mov	[r11+1], al

	mov	al, [arg2+2]
	mov	[r11+2], al

	movdqa	xmm3, [rsp]
	pshufb	xmm3, xmm7	; byte-reflect the plaintext
	pxor	xmm3, xmm0	; xor the initial crc value

	psrldq	xmm3, 5		; 3-byte case: shift right by 8-3 bytes

	jmp	_barrett
_only_less_than_3:
	cmp	arg3, 2
	jl	_only_less_than_2

	; load 2 Bytes
	mov	al, [arg2]
	mov	[r11], al

	mov	al, [arg2+1]
	mov	[r11+1], al

	movdqa	xmm3, [rsp]
	pshufb	xmm3, xmm7	; byte-reflect the plaintext
	pxor	xmm3, xmm0	; xor the initial crc value

	psrldq	xmm3, 6		; 2-byte case: shift right by 8-2 bytes

	jmp	_barrett
_only_less_than_2:

	; load 1 Byte
	mov	al, [arg2]
	mov	[r11], al

	movdqa	xmm3, [rsp]
	pshufb	xmm3, xmm7	; byte-reflect the plaintext
	pxor	xmm3, xmm0	; xor the initial crc value

	psrldq	xmm3, 7		; 1-byte case: shift right by 8-1 bytes

	jmp	_barrett

; NOTE(review): every access to these tables in the visible code is a
; load, so a read-only section may be an option -- confirm against the
; build's section conventions before changing.
section .data

; precomputed constants
; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
;
; Layout: the code loads the constants as 16-byte pairs with movdqa
; ([rk1] = {rk1, rk2}, [rk3] = {rk3, rk4}, [rk5] = {rk5, rk6},
; [rk7] = {rk7, rk8}); the low qword of the pair is the first label.
; Starting from the align 16 below, each of those pairs, mask1, mask2 and
; SHUF_MASK begins on a 16-byte boundary, which the movdqa / pxor / pand
; memory operands rely on. Keep the order and sizes intact.
align 16
; Q = 0x18BB70000
; rk1 = 2^(32*3) mod Q << 32
; rk2 = 2^(32*5) mod Q << 32
; rk3 = 2^(32*15) mod Q << 32
; rk4 = 2^(32*17) mod Q << 32
; rk5 = 2^(32*3) mod Q << 32
; rk6 = 2^(32*2) mod Q << 32
; rk7 = floor(2^64/Q)
; rk8 = Q
rk1:
DQ 0x2d56000000000000
rk2:
DQ 0x06df000000000000
rk3:
DQ 0x044c000000000000
rk4:
DQ 0xe658000000000000
rk5:
DQ 0x2d56000000000000
rk6:
DQ 0x1368000000000000
rk7:
DQ 0x00000001f65a57f8
rk8:
DQ 0x000000018bb70000
; mask1: XORed into a pshufb control vector to flip bit 7 of every byte
mask1:
dq 0x8080808080808080, 0x8080808080808080
; mask2: pand mask that keeps the low 12 bytes and clears the top 4 bytes
mask2:
dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

; SHUF_MASK: pshufb control that reverses the order of all 16 bytes
SHUF_MASK:
dq 0x08090A0B0C0D0E0F, 0x0001020304050607

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; The code reads 16 bytes at pshufb_shf_table + 16 - n (n = 1..15 bytes);
; the 32 table bytes below yield the following control vectors
; (the first row is n = 15, the last row is n = 1):
; different alignments result in values as shown:
;	dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;	dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;	dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;	dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;	dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;	dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;	dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
;	dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
;	dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
;	dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;	dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;	dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;	dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;	dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;	dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908

; slversion is a macro that is not defined in this file (presumably it
; comes from reg_sizes.asm -- confirm).
;;;       func             core, ver, snum
slversion crc16_t10dif_by4, 05,   02,  0016
Commit	Line	Data
7c673cae FG	1	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	2	; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
	3	;
	4	; Redistribution and use in source and binary forms, with or without
	5	; modification, are permitted provided that the following conditions
	6	; are met:
	7	; * Redistributions of source code must retain the above copyright
	8	; notice, this list of conditions and the following disclaimer.
	9	; * Redistributions in binary form must reproduce the above copyright
	10	; notice, this list of conditions and the following disclaimer in
	11	; the documentation and/or other materials provided with the
	12	; distribution.
	13	; * Neither the name of Intel Corporation nor the names of its
	14	; contributors may be used to endorse or promote products derived
	15	; from this software without specific prior written permission.
	16	;
	17	; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	18	; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	19	; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	20	; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	21	; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	22	; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	23	; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	24	; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	25	; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	26	; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	27	; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	28	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	29	;
	30	; Function API:
	31	; UINT16 crc16_t10dif_by4(
	32	; UINT16 init_crc, //initial CRC value, 16 bits
	33	; const unsigned char *buf, //buffer pointer to calculate CRC on
	34	; UINT64 len //buffer length in bytes (64-bit data)
	35	; );
	36	;
	37	; Authors:
	38	; Erdinc Ozturk
	39	; Vinodh Gopal
	40	; James Guilford
	41	;
	42	; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
	43	; URL: http://download.intel.com/design/intarch/papers/323102.pdf
	44	;
	45
	46	%include "reg_sizes.asm"
	47
224ce89b WB	48	%define fetch_dist 1024
224ce89b WB	49
7c673cae FG	50	[bits 64]
	51	default rel
	52
	53	section .text
	54	%ifidn __OUTPUT_FORMAT__, win64
	55	%xdefine arg1 rcx
	56	%xdefine arg2 rdx
	57	%xdefine arg3 r8
	58
	59	%xdefine arg1_low32 ecx
	60	%else
	61	%xdefine arg1 rdi
	62	%xdefine arg2 rsi
	63	%xdefine arg3 rdx
	64
	65	%xdefine arg1_low32 edi
	66	%endif
	67
	68	align 16
20effc67	69	mk_global crc16_t10dif_by4, function
7c673cae	70	crc16_t10dif_by4:
20effc67	71	endbranch
7c673cae FG	72
	73	; adjust the 16-bit initial_crc value, scale it to 32 bits
	74	shl arg1_low32, 16
	75
	76	; After this point, code flow is exactly same as a 32-bit CRC.
	77	; The only difference is before returning eax, we will shift
	78	; it right 16 bits, to scale back to 16 bits.
	79
	80	sub rsp,16*4+8
	81
	82	; push the xmm registers into the stack to maintain
	83	movdqa [rsp+16*2],xmm6
	84	movdqa [rsp+16*3],xmm7
	85
	86	; check if smaller than 128B
	87	cmp arg3, 128
	88
	89	; for sizes less than 128, we can't fold 64B at a time...
	90	jl _less_than_128
	91
	92
	93	; load the initial crc value
	94	movd xmm6, arg1_low32 ; initial crc
	95
	96	; crc value does not need to be byte-reflected, but it needs to
	97	; be moved to the high part of the register.
	98	; because data will be byte-reflected and will align with
	99	; initial crc at correct place.
	100	pslldq xmm6, 12
	101
	102	movdqa xmm7, [SHUF_MASK]
	103	; receive the initial 64B data, xor the initial crc value
	104	movdqu xmm0, [arg2]
	105	movdqu xmm1, [arg2+16]
	106	movdqu xmm2, [arg2+32]
	107	movdqu xmm3, [arg2+48]
	108
	109	pshufb xmm0, xmm7
	110	; XOR the initial_crc value
	111	pxor xmm0, xmm6
	112	pshufb xmm1, xmm7
	113	pshufb xmm2, xmm7
	114	pshufb xmm3, xmm7
	115
	116	movdqa xmm6, [rk3] ;xmm6 has rk3 and rk4
	117	;imm value of pclmulqdq instruction
	118	;will determine which constant to use
	119	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	120	; we subtract 128 instead of 64 to save one instruction from the loop
	121	sub arg3, 128
	122
	123	; at this section of the code, there is 64*x+y (0<=y<64) bytes of
	124	; buffer. The _fold_64_B_loop
	125	; loop will fold 64B at a time until we have 64+y Bytes of buffer
	126
	127
	128	; fold 64B at a time. This section of the code folds 4 xmm
	129	; registers in parallel
	130	_fold_64_B_loop:
	131
	132	; update the buffer pointer
	133	add arg2, 64 ; buf += 64;
	134
224ce89b	135	prefetchnta [arg2+fetch_dist+0]
7c673cae FG	136	movdqu xmm4, xmm0
	137	movdqu xmm5, xmm1
	138
	139	pclmulqdq xmm0, xmm6 , 0x11
	140	pclmulqdq xmm1, xmm6 , 0x11
	141
	142	pclmulqdq xmm4, xmm6, 0x0
	143	pclmulqdq xmm5, xmm6, 0x0
	144
	145	pxor xmm0, xmm4
	146	pxor xmm1, xmm5
	147
224ce89b	148	prefetchnta [arg2+fetch_dist+32]
7c673cae FG	149	movdqu xmm4, xmm2
	150	movdqu xmm5, xmm3
	151
	152	pclmulqdq xmm2, xmm6, 0x11
	153	pclmulqdq xmm3, xmm6, 0x11
	154
	155	pclmulqdq xmm4, xmm6, 0x0
	156	pclmulqdq xmm5, xmm6, 0x0
	157
	158	pxor xmm2, xmm4
	159	pxor xmm3, xmm5
	160
	161	movdqu xmm4, [arg2]
	162	movdqu xmm5, [arg2+16]
	163	pshufb xmm4, xmm7
	164	pshufb xmm5, xmm7
	165	pxor xmm0, xmm4
	166	pxor xmm1, xmm5
	167
	168	movdqu xmm4, [arg2+32]
	169	movdqu xmm5, [arg2+48]
	170	pshufb xmm4, xmm7
	171	pshufb xmm5, xmm7
	172
	173	pxor xmm2, xmm4
	174	pxor xmm3, xmm5
	175
	176	sub arg3, 64
	177
	178	; check if there is another 64B in the buffer to be able to fold
	179	jge _fold_64_B_loop
	180	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	181
	182
	183	add arg2, 64
	184	; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
	185	; the 64B of folded data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3
	186
	187
	188	; fold the 4 xmm registers to 1 xmm register with different constants
	189
	190	movdqa xmm6, [rk1] ;xmm6 has rk1 and rk2
	191	;imm value of pclmulqdq instruction will
	192	;determine which constant to use
	193
	194	movdqa xmm4, xmm0
	195	pclmulqdq xmm0, xmm6, 0x11
	196	pclmulqdq xmm4, xmm6, 0x0
	197	pxor xmm1, xmm4
	198	pxor xmm1, xmm0
	199
	200	movdqa xmm4, xmm1
	201	pclmulqdq xmm1, xmm6, 0x11
	202	pclmulqdq xmm4, xmm6, 0x0
	203	pxor xmm2, xmm4
	204	pxor xmm2, xmm1
	205
	206	movdqa xmm4, xmm2
	207	pclmulqdq xmm2, xmm6, 0x11
	208	pclmulqdq xmm4, xmm6, 0x0
	209	pxor xmm3, xmm4
	210	pxor xmm3, xmm2
	211
	212
213	; instead of 64, we add 48 to the loop counter to save 1 instruction from the loop
214	; instead of a cmp instruction, we use the negative flag with the jl instruction
215	add arg3, 64-16
216	jl _final_reduction_for_128
217
218	; now we have 16+y bytes left to reduce. 16 Bytes
219	; is in register xmm3 and the rest is in memory
220	; we can fold 16 bytes at a time if y>=16
221	; continue folding 16B at a time
222
223	_16B_reduction_loop:
224	movdqa xmm4, xmm3
225	pclmulqdq xmm3, xmm6, 0x11
226	pclmulqdq xmm4, xmm6, 0x0
227	pxor xmm3, xmm4
228	movdqu xmm0, [arg2]
229	pshufb xmm0, xmm7
230	pxor xmm3, xmm0
231	add arg2, 16
232	sub arg3, 16
233	; instead of a cmp instruction, we utilize the flags with the jge instruction
234	; equivalent of: cmp arg3, 16-16
235	; check if there is any more 16B in the buffer to be able to fold
236	jge _16B_reduction_loop
237
238	;now we have 16+z bytes left to reduce, where 0<= z < 16.
239	;first, we reduce the data in the xmm3 register
240
241
242	_final_reduction_for_128:
243	; check if any more data to fold. If not, compute the CRC of the final 128 bits
244	add arg3, 16
245	je _128_done
246
247	; here we are getting data that is less than 16 bytes.
248	; since we know that there was data before the pointer,
249	; we can offset the input pointer before the actual point,
250	; to receive exactly 16 bytes.
251	; after that the registers need to be adjusted.
252	_get_last_two_xmms:
253	movdqa xmm2, xmm3
254
255	movdqu xmm1, [arg2 - 16 + arg3]
256	pshufb xmm1, xmm7
257
258	; get rid of the extra data that was loaded before
259	; load the shift constant
260	lea rax, [pshufb_shf_table + 16]
261	sub rax, arg3
262	movdqu xmm0, [rax]
263
264	; shift xmm2 to the left by arg3 bytes
265	pshufb xmm2, xmm0
266
267	; shift xmm3 to the right by 16-arg3 bytes
268	pxor xmm0, [mask1]
269	pshufb xmm3, xmm0
270	pblendvb xmm1, xmm2 ;xmm0 is implicit
271
272	; fold 16 Bytes
273	movdqa xmm2, xmm1
274	movdqa xmm4, xmm3
275	pclmulqdq xmm3, xmm6, 0x11
276	pclmulqdq xmm4, xmm6, 0x0
277	pxor xmm3, xmm4
278	pxor xmm3, xmm2
279
280	_128_done:
281	; compute crc of a 128-bit value
282	movdqa xmm6, [rk5] ; rk5 and rk6 in xmm6
283	movdqa xmm0, xmm3
284
285	;64b fold
286	pclmulqdq xmm3, xmm6, 0x1
287	pslldq xmm0, 8
288	pxor xmm3, xmm0
289
290	;32b fold
291	movdqa xmm0, xmm3
292
293	pand xmm0, [mask2]
294
295	psrldq xmm3, 12
296	pclmulqdq xmm3, xmm6, 0x10
297	pxor xmm3, xmm0
298
299	;barrett reduction
300	_barrett:
301	movdqa xmm6, [rk7] ; rk7 and rk8 in xmm6
302	movdqa xmm0, xmm3
303	pclmulqdq xmm3, xmm6, 0x01
304	pslldq xmm3, 4
305	pclmulqdq xmm3, xmm6, 0x11
306
307	pslldq xmm3, 4
308	pxor xmm3, xmm0
309	pextrd eax, xmm3,1
310
311	_cleanup:
312	; scale the result back to 16 bits
313	shr eax, 16
314	movdqa xmm6, [rsp+16*2]
315	movdqa xmm7, [rsp+16*3]
316	add rsp,16*4+8
317	ret
318
319
320	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
321	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
322	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
323	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
324
325	align 16
326	_less_than_128:
327
328	; check if there is enough buffer to be able to fold 16B at a time
329	cmp arg3, 32
330	jl _less_than_32
331	movdqa xmm7, [SHUF_MASK]
332
333	; if there is, load the constants
334	movdqa xmm6, [rk1] ; rk1 and rk2 in xmm6
335
336	movd xmm0, arg1_low32 ; get the initial crc value
337	pslldq xmm0, 12 ; align it to its correct place
338	movdqu xmm3, [arg2] ; load the plaintext
339	pshufb xmm3, xmm7 ; byte-reflect the plaintext
340	pxor xmm3, xmm0
341
342
343	; update the buffer pointer
344	add arg2, 16
345
346	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
347	sub arg3, 32
348
349	jmp _16B_reduction_loop
350
351
352	align 16
353	_less_than_32:
354	; mov initial crc to the return value. this is necessary for zero-length buffers.
355	mov eax, arg1_low32
356	test arg3, arg3
357	je _cleanup
358
359	movdqa xmm7, [SHUF_MASK]
360
361	movd xmm0, arg1_low32 ; get the initial crc value
362	pslldq xmm0, 12 ; align it to its correct place
363
364	cmp arg3, 16
365	je _exact_16_left
366	jl _less_than_16_left
367
368	movdqu xmm3, [arg2] ; load the plaintext
369	pshufb xmm3, xmm7 ; byte-reflect the plaintext
370	pxor xmm3, xmm0 ; xor the initial crc value
371	add arg2, 16
372	sub arg3, 16
373	movdqa xmm6, [rk1] ; rk1 and rk2 in xmm6
374	jmp _get_last_two_xmms
375
376
377	align 16
378	_less_than_16_left:
379	; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
380
381	pxor xmm1, xmm1
382	mov r11, rsp
383	movdqa [r11], xmm1
384
385	cmp arg3, 4
386	jl _only_less_than_4
387
388	; backup the counter value
389	mov r9, arg3
390	cmp arg3, 8
391	jl _less_than_8_left
392
393	; load 8 Bytes
394	mov rax, [arg2]
395	mov [r11], rax
396	add r11, 8
397	sub arg3, 8
398	add arg2, 8
399	_less_than_8_left:
400
401	cmp arg3, 4
402	jl _less_than_4_left
403
404	; load 4 Bytes
405	mov eax, [arg2]
406	mov [r11], eax
407	add r11, 4
408	sub arg3, 4
409	add arg2, 4
410	_less_than_4_left:
411
412	cmp arg3, 2
413	jl _less_than_2_left
414
415	; load 2 Bytes
416	mov ax, [arg2]
417	mov [r11], ax
418	add r11, 2
419	sub arg3, 2
420	add arg2, 2
421	_less_than_2_left:
422	cmp arg3, 1
423	jl _zero_left
424
425	; load 1 Byte
426	mov al, [arg2]
427	mov [r11], al
428	_zero_left:
429	movdqa xmm3, [rsp]
430	pshufb xmm3, xmm7
431	pxor xmm3, xmm0 ; xor the initial crc value
432
433	; shl r9, 4
434	lea rax, [pshufb_shf_table + 16]
435	sub rax, r9
436	movdqu xmm0, [rax]
437	pxor xmm0, [mask1]
438
439	pshufb xmm3, xmm0
440	jmp _128_done
441
442	align 16
443	_exact_16_left:
444	movdqu xmm3, [arg2]
445	pshufb xmm3, xmm7
446	pxor xmm3, xmm0 ; xor the initial crc value
447
448	jmp _128_done
449
450	_only_less_than_4:
451	cmp arg3, 3
452	jl _only_less_than_3
453
454	; load 3 Bytes
455	mov al, [arg2]
456	mov [r11], al
457
458	mov al, [arg2+1]
459	mov [r11+1], al
460
461	mov al, [arg2+2]
462	mov [r11+2], al
463
464	movdqa xmm3, [rsp]
465	pshufb xmm3, xmm7
466	pxor xmm3, xmm0 ; xor the initial crc value
467
468	psrldq xmm3, 5
469
470	jmp _barrett
471	_only_less_than_3:
472	cmp arg3, 2
473	jl _only_less_than_2
474
475	; load 2 Bytes
476	mov al, [arg2]
477	mov [r11], al
478
479	mov al, [arg2+1]
480	mov [r11+1], al
481
482	movdqa xmm3, [rsp]
483	pshufb xmm3, xmm7
484	pxor xmm3, xmm0 ; xor the initial crc value
485
486	psrldq xmm3, 6
487
488	jmp _barrett
489	_only_less_than_2:
490
491	; load 1 Byte
492	mov al, [arg2]
493	mov [r11], al
494
495	movdqa xmm3, [rsp]
496	pshufb xmm3, xmm7
497	pxor xmm3, xmm0 ; xor the initial crc value
498
499	psrldq xmm3, 7
500
501	jmp _barrett
502
503	section .data
504
505	; precomputed constants
506	; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
507	align 16
508	; Q = 0x18BB70000
509	; rk1 = 2^(32*3) mod Q << 32
510	; rk2 = 2^(32*5) mod Q << 32
511	; rk3 = 2^(32*15) mod Q << 32
512	; rk4 = 2^(32*17) mod Q << 32
513	; rk5 = 2^(32*3) mod Q << 32
514	; rk6 = 2^(32*2) mod Q << 32
515	; rk7 = floor(2^64/Q)
516	; rk8 = Q
517	rk1:
518	DQ 0x2d56000000000000
519	rk2:
520	DQ 0x06df000000000000
521	rk3:
522	DQ 0x044c000000000000
523	rk4:
524	DQ 0xe658000000000000
525	rk5:
526	DQ 0x2d56000000000000
527	rk6:
528	DQ 0x1368000000000000
529	rk7:
530	DQ 0x00000001f65a57f8
531	rk8:
532	DQ 0x000000018bb70000
533	mask1:
534	dq 0x8080808080808080, 0x8080808080808080
535	mask2:
536	dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
537
538	SHUF_MASK:
539	dq 0x08090A0B0C0D0E0F, 0x0001020304050607
540
541	pshufb_shf_table:
542	; use these values for shift constants for the pshufb instruction
543	; different alignments result in values as shown:
544	; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
545	; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2
546	; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3
547	; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
548	; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
549	; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
550	; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
551	; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
552	; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
553	; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
554	; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
555	; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
556	; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
557	; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
558	; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
559	dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
560	dq 0x0706050403020100, 0x000e0d0c0b0a0908
561
562	;;; func core, ver, snum
563	slversion crc16_t10dif_by4, 05, 02, 0016