[ceph.git] / ceph / src / isa-l / crc / crc16_t10dif_01.asm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;       Function API:
;       UINT16 crc16_t10dif_01(
;               UINT16 init_crc, //initial CRC value, 16 bits
;               const unsigned char *buf, //buffer pointer to calculate CRC on
;               UINT64 len //buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf

%include "reg_sizes.asm"

[bits 64]
default rel

section .text

; ABI-dependent names used by the routine below.
;   arg1/arg2/arg3  - registers carrying the 1st/2nd/3rd integer argument
;   arg1_low32      - 32-bit view of arg1
;   VARIABLE_OFFSET - bytes subtracted from rsp on entry. The trailing +8
;                     makes rsp 16-byte aligned after the return address
;                     pushed by the caller, which the movdqa accesses to the
;                     stack rely on.
;   XMM_SAVE        - (win64 only) offset from rsp of the xmm6-xmm13 save area
%ifidn __OUTPUT_FORMAT__, win64
        ; Microsoft x64: arguments arrive in rcx, rdx, r8.
        %xdefine        arg1            rcx
        %xdefine        arg1_low32      ecx
        %xdefine        arg2            rdx
        %xdefine        arg3            r8

        ; 2*16 bytes of scratch + 8*16 bytes for xmm6-xmm13 + 8 for alignment
        %define         XMM_SAVE        16*2
        %define         VARIABLE_OFFSET 16*10+8
%else
        ; Any other output format: arguments arrive in rdi, rsi, rdx.
        %xdefine        arg1            rdi
        %xdefine        arg1_low32      edi
        %xdefine        arg2            rsi
        %xdefine        arg3            rdx

        ; 2*16 bytes of scratch + 8 for alignment
        %define         VARIABLE_OFFSET 16*2+8
%endif

align 16
global	crc16_t10dif_01:function
;-----------------------------------------------------------------------
; UINT16 crc16_t10dif_01(UINT16 init_crc, const unsigned char *buf, UINT64 len)
;
; In:    arg1_low32 = init_crc, arg2 = buf, arg3 = len (bytes)
; Out:   eax = CRC (shifted back down to 16 bits at _cleanup)
; Stack: VARIABLE_OFFSET bytes are reserved; the 16 bytes at [rsp] are used
;        as scratch by the short-buffer paths, and on win64 xmm6-xmm13 are
;        saved in the area starting at rsp+XMM_SAVE.
; Uses:  rax, r9, r11, the three argument registers, xmm0-xmm13, flags
;-----------------------------------------------------------------------
crc16_t10dif_01:

	; adjust the 16-bit initial_crc value, scale it to 32 bits
	shl	arg1_low32, 16

	; After this point, code flow is exactly same as a 32-bit CRC.
	; The only difference is before returning eax, we will shift it right 16 bits, to scale back to 16 bits.

	sub	rsp, VARIABLE_OFFSET
%ifidn __OUTPUT_FORMAT__, win64
	; xmm6-xmm13 are callee-saved in the Microsoft x64 ABI and are all used
	; below, so spill them to the save area (16-byte aligned, hence movdqa).
	movdqa	[rsp + XMM_SAVE + 16*0], xmm6
	movdqa	[rsp + XMM_SAVE + 16*1], xmm7
	movdqa	[rsp + XMM_SAVE + 16*2], xmm8
	movdqa	[rsp + XMM_SAVE + 16*3], xmm9
	movdqa	[rsp + XMM_SAVE + 16*4], xmm10
	movdqa	[rsp + XMM_SAVE + 16*5], xmm11
	movdqa	[rsp + XMM_SAVE + 16*6], xmm12
	movdqa	[rsp + XMM_SAVE + 16*7], xmm13
%endif

	; Dispatch on buffer length. The 128-byte folding path needs at least 256
	; bytes: 128 are loaded up front here and the first loop pass reads another
	; 128 unconditionally.
	; NOTE(review): jl is a signed comparison while len is declared UINT64, so
	; a length >= 2^63 would take the short-buffer path -- confirm callers
	; never pass such a value.
	; check if smaller than 256
	cmp	arg3, 256

	; for sizes less than 256, we can't fold 128B at a time...
	jl	_less_than_256


	; load the initial crc value
	movd	xmm10, arg1_low32	; initial crc

	; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
	; because data will be byte-reflected and will align with initial crc at correct place.
	pslldq	xmm10, 12		; move the 32-bit value into the top dword of xmm10

	; xmm11 holds the byte-reversal shuffle mask for the rest of the routine
	movdqa xmm11, [SHUF_MASK]
	; receive the initial 128B data, xor the initial crc value
	movdqu	xmm0, [arg2+16*0]
	movdqu	xmm1, [arg2+16*1]
	movdqu	xmm2, [arg2+16*2]
	movdqu	xmm3, [arg2+16*3]
	movdqu	xmm4, [arg2+16*4]
	movdqu	xmm5, [arg2+16*5]
	movdqu	xmm6, [arg2+16*6]
	movdqu	xmm7, [arg2+16*7]

	; byte-reverse each 16-byte lane; only the first lane gets the crc mixed in
	pshufb	xmm0, xmm11
	; XOR the initial_crc value
	pxor	xmm0, xmm10
	pshufb	xmm1, xmm11
	pshufb	xmm2, xmm11
	pshufb	xmm3, xmm11
	pshufb	xmm4, xmm11
	pshufb	xmm5, xmm11
	pshufb	xmm6, xmm11
	pshufb	xmm7, xmm11

	; xmm10 is reused from here on as the folding-constant register
	movdqa	xmm10, [rk3]	;xmm10 has rk3 and rk4
					;imm value of pclmulqdq instruction will determine which constant to use
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	; we subtract 256 instead of 128 to save one instruction from the loop
	; (arg3 now holds "bytes not yet loaded, minus 128", so the loop can
	; test the sign of the counter directly)
	sub	arg3, 256

	; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
	; loop will fold 128B at a time until we have 128+y Bytes of buffer


	; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
	;
	; Register roles inside the loop:
	;   xmm0-xmm7   running 128-bit accumulators, one per 16-byte lane
	;   xmm10       rk3 (low qword) / rk4 (high qword)
	;   xmm11       byte-reversal shuffle mask
	;   xmm9,xmm12  freshly loaded, byte-reversed input for two lanes
	;   xmm8,xmm13  copies of two accumulators, used for the high-qword product
	; Per lane:  acc = clmul(acc.low, rk3) ^ clmul(acc.high, rk4) ^ new_data
	; (imm 0x0 multiplies the two low qwords, imm 0x11 the two high qwords;
	;  xorps performs the same 128-bit XOR as pxor).
_fold_128_B_loop:

	; update the buffer pointer
	add	arg2, 128		;    buf += 128;

	; lanes 0 and 1
	movdqu	xmm9, [arg2+16*0]
	movdqu	xmm12, [arg2+16*1]
	pshufb	xmm9, xmm11
	pshufb	xmm12, xmm11
	movdqa	xmm8, xmm0
	movdqa	xmm13, xmm1
	pclmulqdq	xmm0, xmm10, 0x0
	pclmulqdq	xmm8, xmm10 , 0x11
	pclmulqdq	xmm1, xmm10, 0x0
	pclmulqdq	xmm13, xmm10 , 0x11
	pxor	xmm0, xmm9
	xorps	xmm0, xmm8
	pxor	xmm1, xmm12
	xorps	xmm1, xmm13

	; lanes 2 and 3
	movdqu	xmm9, [arg2+16*2]
	movdqu	xmm12, [arg2+16*3]
	pshufb	xmm9, xmm11
	pshufb	xmm12, xmm11
	movdqa	xmm8, xmm2
	movdqa	xmm13, xmm3
	pclmulqdq	xmm2, xmm10, 0x0
	pclmulqdq	xmm8, xmm10 , 0x11
	pclmulqdq	xmm3, xmm10, 0x0
	pclmulqdq	xmm13, xmm10 , 0x11
	pxor	xmm2, xmm9
	xorps	xmm2, xmm8
	pxor	xmm3, xmm12
	xorps	xmm3, xmm13

	; lanes 4 and 5
	movdqu	xmm9, [arg2+16*4]
	movdqu	xmm12, [arg2+16*5]
	pshufb	xmm9, xmm11
	pshufb	xmm12, xmm11
	movdqa	xmm8, xmm4
	movdqa	xmm13, xmm5
	pclmulqdq	xmm4, xmm10, 0x0
	pclmulqdq	xmm8, xmm10 , 0x11
	pclmulqdq	xmm5, xmm10, 0x0
	pclmulqdq	xmm13, xmm10 , 0x11
	pxor	xmm4, xmm9
	xorps	xmm4, xmm8
	pxor	xmm5, xmm12
	xorps	xmm5, xmm13

	; lanes 6 and 7
	movdqu	xmm9, [arg2+16*6]
	movdqu	xmm12, [arg2+16*7]
	pshufb	xmm9, xmm11
	pshufb	xmm12, xmm11
	movdqa	xmm8, xmm6
	movdqa	xmm13, xmm7
	pclmulqdq	xmm6, xmm10, 0x0
	pclmulqdq	xmm8, xmm10 , 0x11
	pclmulqdq	xmm7, xmm10, 0x0
	pclmulqdq	xmm13, xmm10 , 0x11
	pxor	xmm6, xmm9
	xorps	xmm6, xmm8
	pxor	xmm7, xmm12
	xorps	xmm7, xmm13

	; the counter was pre-biased by an extra 128, so "still >= 0 after the
	; subtraction" means a further full 128-byte block is available
	sub	arg3, 128

	; check if there is another 128B in the buffer to be able to fold
	jge	_fold_128_B_loop
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


	; step past the last 128-byte block that the loop loaded
	add	arg2, 128
	; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
	; fold the 8 xmm registers to 1 xmm register with different constants
	;
	; Each step below folds one accumulator into xmm7:
	;   xmm7 ^= clmul(xmmN.low, rk_low) ^ clmul(xmmN.high, rk_high)
	; where the 16-byte load at rk9/rk11/.../rk19/rk1 puts rk_low in the low
	; qword of xmm10 and the following constant (rk10/rk12/.../rk20/rk2) in the
	; high qword. xmm8 is a scratch copy used for the low-qword product.

	; xmm0 with rk9/rk10
	movdqa	xmm10, [rk9]
	movdqa	xmm8, xmm0
	pclmulqdq	xmm0, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	xorps	xmm7, xmm0

	; xmm1 with rk11/rk12
	movdqa	xmm10, [rk11]
	movdqa	xmm8, xmm1
	pclmulqdq	xmm1, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	xorps	xmm7, xmm1

	; xmm2 with rk13/rk14
	movdqa	xmm10, [rk13]
	movdqa	xmm8, xmm2
	pclmulqdq	xmm2, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	pxor	xmm7, xmm2

	; xmm3 with rk15/rk16
	movdqa	xmm10, [rk15]
	movdqa	xmm8, xmm3
	pclmulqdq	xmm3, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	xorps	xmm7, xmm3

	; xmm4 with rk17/rk18
	movdqa	xmm10, [rk17]
	movdqa	xmm8, xmm4
	pclmulqdq	xmm4, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	pxor	xmm7, xmm4

	; xmm5 with rk19/rk20
	movdqa	xmm10, [rk19]
	movdqa	xmm8, xmm5
	pclmulqdq	xmm5, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	xorps	xmm7, xmm5

	; xmm6 with rk1/rk2; xmm10 is left holding rk1/rk2 because the 16-byte
	; reduction code that follows uses the same pair
	movdqa	xmm10, [rk1]	;xmm10 has rk1 and rk2
				;imm value of pclmulqdq instruction will determine which constant to use
	movdqa	xmm8, xmm6
	pclmulqdq	xmm6, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	pxor	xmm7, xmm6


	; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
	; instead of a cmp instruction, we use the negative flag with the jl instruction
	; (after the add, arg3 = remaining bytes in memory - 16; negative means
	; fewer than 16 bytes are left to read)
	add	arg3, 128-16
	jl	_final_reduction_for_128

	; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
	; we can fold 16 bytes at a time if y>=16
	; continue folding 16B at a time
	;
	; Loop state on every entry (also reached by a jmp from _less_than_256):
	;   xmm7  = 128-bit running remainder
	;   xmm10 = rk1 (low qword) / rk2 (high qword)
	;   xmm11 = byte-reversal shuffle mask
	;   arg2  = next unread input byte
	;   arg3  = unread bytes - 16, and >= 0

_16B_reduction_loop:
	movdqa	xmm8, xmm7
	pclmulqdq	xmm7, xmm10, 0x11	; high qword * rk2
	pclmulqdq	xmm8, xmm10, 0x0	; low qword * rk1
	pxor	xmm7, xmm8
	movdqu	xmm0, [arg2]		; next 16 input bytes
	pshufb	xmm0, xmm11		; byte-reverse them
	pxor	xmm7, xmm0
	add	arg2, 16
	sub	arg3, 16
	; instead of a cmp instruction, we utilize the flags with the jge instruction
	; equivalent of: cmp arg3, 16-16
	; check if there is any more 16B in the buffer to be able to fold
	jge	_16B_reduction_loop

	;now we have 16+z bytes left to reduce, where 0<= z < 16.
	;first, we reduce the data in the xmm7 register


; Handle the 0..15 trailing bytes that did not fill a whole 16-byte block.
; On entry arg3 = (trailing byte count) - 16; xmm7 holds the running remainder.
_final_reduction_for_128:
	; check if any more data to fold. If not, compute the CRC of the final 128 bits
	add	arg3, 16		; arg3 = trailing byte count (0..15)
	je	_128_done

	; here we are getting data that is less than 16 bytes.
	; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
	; after that the registers need to be adjusted.
; Also entered directly from _less_than_32 for 17..31-byte buffers, with
; arg3 = len - 16, xmm7 = first 16 bytes (crc already mixed in) and
; xmm10 = rk1/rk2.
_get_last_two_xmms:
	movdqa	xmm2, xmm7

	; load the final 16 bytes of the buffer; this overlaps (16 - arg3) bytes
	; that were already consumed into xmm7
	movdqu	xmm1, [arg2 - 16 + arg3]
	pshufb	xmm1, xmm11

	; get rid of the extra data that was loaded before
	; load the shift constant
	lea	rax, [pshufb_shf_table + 16]
	sub	rax, arg3		; rax = &pshufb_shf_table[16 - arg3]
	movdqu	xmm0, [rax]

	; shift xmm2 to the left by arg3 bytes
	pshufb	xmm2, xmm0

	; shift xmm7 to the right by 16-arg3 bytes
	; (xor with 0x80 per byte flips which shuffle-mask entries zero their lane)
	pxor	xmm0, [mask1]
	pshufb	xmm7, xmm0
	; per byte: take xmm2 where the mask byte in xmm0 has its top bit set,
	; otherwise keep the freshly loaded data in xmm1
	pblendvb	xmm1, xmm2	;xmm0 is implicit

	; fold 16 Bytes
	movdqa	xmm2, xmm1
	movdqa	xmm8, xmm7
	pclmulqdq	xmm7, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	pxor	xmm7, xmm2

; Reduce the 128-bit remainder in xmm7 to the final 32-bit value in eax.
; Reached by fall-through and by jumps from the short-buffer paths.
_128_done:
	; compute crc of a 128-bit value
	movdqa	xmm10, [rk5]	; rk5 and rk6 in xmm10
	movdqa	xmm0, xmm7

	;64b fold
	pclmulqdq	xmm7, xmm10, 0x1	; high qword of xmm7 * rk5
	pslldq	xmm0, 8			; original low qword moved to the high qword
	pxor	xmm7, xmm0

	;32b fold
	movdqa	xmm0, xmm7

	pand	xmm0, [mask2]		; keep the low 96 bits

	psrldq	xmm7, 12		; top dword moved down to the lowest dword
	pclmulqdq	xmm7, xmm10, 0x10	; low qword of xmm7 * rk6
	pxor	xmm7, xmm0

	;barrett reduction
; Also the direct entry point for the 1-, 2- and 3-byte paths, which arrive
; with their data already positioned in xmm7.
_barrett:
	movdqa	xmm10, [rk7]	; rk7 and rk8 in xmm10
	movdqa	xmm0, xmm7
	pclmulqdq	xmm7, xmm10, 0x01	; high qword of xmm7 * rk7
	pslldq	xmm7, 4
	pclmulqdq	xmm7, xmm10, 0x11	; high qword of xmm7 * rk8

	pslldq	xmm7, 4
	pxor	xmm7, xmm0
	pextrd	eax, xmm7,1		; result is dword 1 (bits 32..63) of xmm7

; Common exit. On entry eax holds the CRC in its upper 16 bits (either the
; computed value or, for a zero-length buffer, the scaled initial crc).
_cleanup:
	; scale the result back to 16 bits
	shr	eax, 16
%ifidn __OUTPUT_FORMAT__, win64
	; reload the callee-saved xmm registers spilled in the prologue
	movdqa	xmm6, [rsp + XMM_SAVE + 16*0]
	movdqa	xmm7, [rsp + XMM_SAVE + 16*1]
	movdqa	xmm8, [rsp + XMM_SAVE + 16*2]
	movdqa	xmm9, [rsp + XMM_SAVE + 16*3]
	movdqa	xmm10, [rsp + XMM_SAVE + 16*4]
	movdqa	xmm11, [rsp + XMM_SAVE + 16*5]
	movdqa	xmm12, [rsp + XMM_SAVE + 16*6]
	movdqa	xmm13, [rsp + XMM_SAVE + 16*7]
%endif
	add	rsp, VARIABLE_OFFSET
	ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
; Buffers shorter than 256 bytes. Lengths of 32 or more are fed through the
; 16-byte reduction loop; anything shorter goes to _less_than_32.
_less_than_256:

	; check if there is enough buffer to be able to fold 16B at a time
	cmp	arg3, 32
	jl	_less_than_32
	movdqa xmm11, [SHUF_MASK]	; byte-reversal shuffle mask

	; if there is, load the constants
	movdqa	xmm10, [rk1]	; rk1 and rk2 in xmm10

	movd	xmm0, arg1_low32	; get the initial crc value
	pslldq	xmm0, 12	; align it to its correct place
	movdqu	xmm7, [arg2]	; load the plaintext
	pshufb	xmm7, xmm11	; byte-reflect the plaintext
	pxor	xmm7, xmm0	; xor the initial crc value


	; update the buffer pointer
	add	arg2, 16

	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
	; (arg3 becomes unread bytes - 16, which is what the loop expects)
	sub	arg3, 32

	jmp	_16B_reduction_loop


align 16
; Buffers of 0..31 bytes: 0 returns the initial crc, 16 and <16 have their
; own paths, 17..31 reuse _get_last_two_xmms.
_less_than_32:
	; mov initial crc to the return value. this is necessary for zero-length buffers.
	; (arg1_low32 was shifted left by 16 on entry; _cleanup shifts eax back)
	mov	eax, arg1_low32
	test	arg3, arg3
	je	_cleanup

	movdqa xmm11, [SHUF_MASK]	; byte-reversal shuffle mask

	movd	xmm0, arg1_low32	; get the initial crc value
	pslldq	xmm0, 12	; align it to its correct place

	; both branches below consume the flags of this single compare
	cmp	arg3, 16
	je	_exact_16_left
	jl	_less_than_16_left

	; 17..31 bytes: first 16 go into xmm7, the tail is handled by the
	; overlapping-load code at _get_last_two_xmms
	movdqu	xmm7, [arg2]	; load the plaintext
	pshufb	xmm7, xmm11	; byte-reflect the plaintext
	pxor	xmm7, xmm0	; xor the initial crc value
	add	arg2, 16
	sub	arg3, 16		; arg3 = trailing byte count (1..15)
	movdqa	xmm10, [rk1]	; rk1 and rk2 in xmm10
	jmp	_get_last_two_xmms


align 16
; Buffers of 1..15 bytes. The input is copied piecewise into a zeroed 16-byte
; scratch slot at [rsp] so that it can be loaded as one xmm value without
; reading past the end of the caller's buffer.
;   xmm0  = initial crc in the top dword (set up by _less_than_32)
;   xmm11 = byte-reversal shuffle mask
;   r11   = write cursor into the scratch slot
;   r9    = original length (only set on the 4..15-byte path)
_less_than_16_left:
	; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.

	pxor	xmm1, xmm1
	mov	r11, rsp
	movdqa	[r11], xmm1

	; 1..3 bytes are handled separately and skip the 128-bit fold
	cmp	arg3, 4
	jl	_only_less_than_4

	;	backup the counter value
	mov	r9, arg3
	cmp	arg3, 8
	jl	_less_than_8_left

	; load 8 Bytes
	mov	rax, [arg2]
	mov	[r11], rax
	add	r11, 8
	sub	arg3, 8
	add	arg2, 8
_less_than_8_left:

	cmp	arg3, 4
	jl	_less_than_4_left

	; load 4 Bytes
	mov	eax, [arg2]
	mov	[r11], eax
	add	r11, 4
	sub	arg3, 4
	add	arg2, 4
_less_than_4_left:

	cmp	arg3, 2
	jl	_less_than_2_left

	; load 2 Bytes
	mov	ax, [arg2]
	mov	[r11], ax
	add	r11, 2
	sub	arg3, 2
	add	arg2, 2
_less_than_2_left:
	cmp     arg3, 1
        jl      _zero_left

	; load 1 Byte
	mov	al, [arg2]
	mov	[r11], al
_zero_left:
	; scratch slot now holds the input followed by zero padding
	movdqa	xmm7, [rsp]
	pshufb	xmm7, xmm11
	pxor	xmm7, xmm0	; xor the initial crc value

	; shift xmm7 right by (16 - original length) bytes, using the table
	; entry selected by r9 with every mask byte's top bit flipped
	lea	rax, [pshufb_shf_table + 16]
	sub	rax, r9
	movdqu	xmm0, [rax]
	pxor	xmm0, [mask1]

	pshufb	xmm7, xmm0
	jmp	_128_done

align 16
; Buffer of exactly 16 bytes: one load, no folding, straight to the final
; 128-bit reduction. xmm0 = initial crc in the top dword, xmm11 = shuffle mask.
_exact_16_left:
	movdqu	xmm7, [arg2]
	pshufb	xmm7, xmm11	; byte-reflect the plaintext
	pxor	xmm7, xmm0	; xor the initial crc value

	jmp	_128_done

; Buffers of 1..3 bytes. Entered from _less_than_16_left with r11 = rsp
; pointing at the zeroed 16-byte scratch slot, xmm0 = initial crc in the top
; dword and xmm11 = byte-reversal shuffle mask. Each case copies the bytes
; one at a time, shifts the value right by (8 - length) bytes and jumps
; straight to _barrett, bypassing the 64-bit and 32-bit folds.
_only_less_than_4:
	cmp	arg3, 3
	jl	_only_less_than_3

	; load 3 Bytes
	mov	al, [arg2]
	mov	[r11], al

	mov	al, [arg2+1]
	mov	[r11+1], al

	mov	al, [arg2+2]
	mov	[r11+2], al

	movdqa	xmm7, [rsp]
	pshufb	xmm7, xmm11
	pxor	xmm7, xmm0	; xor the initial crc value

	psrldq	xmm7, 5

	jmp	_barrett
_only_less_than_3:
	cmp	arg3, 2
	jl	_only_less_than_2

	; load 2 Bytes
	mov	al, [arg2]
	mov	[r11], al

	mov	al, [arg2+1]
	mov	[r11+1], al

	movdqa	xmm7, [rsp]
	pshufb	xmm7, xmm11
	pxor	xmm7, xmm0	; xor the initial crc value

	psrldq	xmm7, 6

	jmp	_barrett
_only_less_than_2:

	; load 1 Byte
	mov	al, [arg2]
	mov	[r11], al

	movdqa	xmm7, [rsp]
	pshufb	xmm7, xmm11
	pxor	xmm7, xmm0	; xor the initial crc value

	psrldq	xmm7, 7

	jmp	_barrett

section .data

; Folding and reduction constants, precomputed from the polynomial
; 0x8bb70000 (0x8bb7 scaled to 32 bits).
; Formulas as recorded by the original authors:
;   Q   = 0x18BB70000
;   rk1 = 2^(32*3)  mod Q << 32
;   rk2 = 2^(32*5)  mod Q << 32
;   rk3 = 2^(32*15) mod Q << 32
;   rk4 = 2^(32*17) mod Q << 32
;   rk5 = 2^(32*3)  mod Q << 32
;   rk6 = 2^(32*2)  mod Q << 32
;   rk7 = floor(2^64/Q)
;   rk8 = Q
; The code always loads these as 16-byte pairs (rk1/rk2, rk3/rk4, ...), so
; every odd-numbered constant must stay 16-byte aligned and be immediately
; followed by its even-numbered partner.
align 16
rk1:	dq 0x2d56000000000000	; rk1/rk2: 16-byte folding
rk2:	dq 0x06df000000000000
rk3:	dq 0x9d9d000000000000	; rk3/rk4: 128-byte folding loop
rk4:	dq 0x7cf5000000000000
rk5:	dq 0x2d56000000000000	; rk5/rk6: final 128-bit reduction
rk6:	dq 0x1368000000000000
rk7:	dq 0x00000001f65a57f8	; rk7/rk8: Barrett reduction
rk8:	dq 0x000000018bb70000

; rk9..rk20: pairs used to fold accumulators xmm0..xmm5 into xmm7 after the
; 128-byte loop (rk9/rk10 for xmm0, rk11/rk12 for xmm1, ... rk19/rk20 for xmm5)
rk9:	dq 0xceae000000000000
rk10:	dq 0xbfd6000000000000
rk11:	dq 0x1e16000000000000
rk12:	dq 0x713c000000000000
rk13:	dq 0xf7f9000000000000
rk14:	dq 0x80a6000000000000
rk15:	dq 0x044c000000000000
rk16:	dq 0xe658000000000000
rk17:	dq 0xad18000000000000
rk18:	dq 0xa497000000000000
rk19:	dq 0x6ee3000000000000
rk20:	dq 0xe7b5000000000000


; 0x80 in every byte: XORed into a pshufb mask to flip each entry's top bit
mask1:
	dq 0x8080808080808080, 0x8080808080808080
; keeps the low 96 bits of an xmm register
mask2:
	dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

; pshufb mask that reverses the 16 bytes of an xmm register
SHUF_MASK:
	dq 0x08090A0B0C0D0E0F, 0x0001020304050607

; 32-byte sliding window of pshufb masks. The code reads 16 bytes starting at
; pshufb_shf_table + 16 - n, where n is a byte count in 1..15. Used directly
; the mask shifts a register left by n bytes; XORed with mask1 it shifts
; right by 16-n bytes. The 16-byte windows this produces are:
;	dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;	dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;	dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;	dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;	dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;	dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;	dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
;	dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
;	dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
;	dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;	dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;	dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;	dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;	dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;	dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
pshufb_shf_table:
	dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
	dq 0x0706050403020100, 0x000e0d0c0b0a0908

; slversion is not defined in this file (presumably supplied by the included
; reg_sizes.asm -- confirm); arguments are, per the legend below:
;;;       func          core, ver, snum
slversion crc16_t10dif_01, 01,   06,  0010
Commit	Line	Data
7c673cae FG	1	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	2	; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
	3	;
	4	; Redistribution and use in source and binary forms, with or without
	5	; modification, are permitted provided that the following conditions
	6	; are met:
	7	; * Redistributions of source code must retain the above copyright
	8	; notice, this list of conditions and the following disclaimer.
	9	; * Redistributions in binary form must reproduce the above copyright
	10	; notice, this list of conditions and the following disclaimer in
	11	; the documentation and/or other materials provided with the
	12	; distribution.
	13	; * Neither the name of Intel Corporation nor the names of its
	14	; contributors may be used to endorse or promote products derived
	15	; from this software without specific prior written permission.
	16	;
	17	; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	18	; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	19	; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	20	; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	21	; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	22	; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	23	; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	24	; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	25	; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	26	; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	27	; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	28	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	29
	30	; Function API:
	31	; UINT16 crc16_t10dif_01(
	32	; UINT16 init_crc, //initial CRC value, 16 bits
	33	; const unsigned char *buf, //buffer pointer to calculate CRC on
	34	; UINT64 len //buffer length in bytes (64-bit data)
	35	; );
	36	;
	37	; Authors:
	38	; Erdinc Ozturk
	39	; Vinodh Gopal
	40	; James Guilford
	41	;
	42	; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
	43	; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
	44
	45	%include "reg_sizes.asm"
	46
	47	[bits 64]
	48	default rel
	49
	50	section .text
	51
	52	%ifidn __OUTPUT_FORMAT__, win64
	53	%xdefine arg1 rcx
	54	%xdefine arg2 rdx
	55	%xdefine arg3 r8
	56
	57	%xdefine arg1_low32 ecx
	58	%else
	59	%xdefine arg1 rdi
	60	%xdefine arg2 rsi
	61	%xdefine arg3 rdx
	62
	63	%xdefine arg1_low32 edi
	64	%endif
65
66	%ifidn __OUTPUT_FORMAT__, win64
67	%define XMM_SAVE 16*2
68	%define VARIABLE_OFFSET 16*10+8
69	%else
70	%define VARIABLE_OFFSET 16*2+8
71	%endif
72
73	align 16
74	global crc16_t10dif_01:function
75	crc16_t10dif_01:
76
77	; adjust the 16-bit initial_crc value, scale it to 32 bits
78	shl arg1_low32, 16
79
80	; After this point, code flow is exactly same as a 32-bit CRC.
81	; The only difference is before returning eax, we will shift it right 16 bits, to scale back to 16 bits.
82
83	sub rsp, VARIABLE_OFFSET
84	%ifidn __OUTPUT_FORMAT__, win64
85	; push the xmm registers into the stack to maintain
86	movdqa [rsp+16*2],xmm6
87	movdqa [rsp+16*3],xmm7
88	movdqa [rsp+16*4],xmm8
89	movdqa [rsp+16*5],xmm9
90	movdqa [rsp+16*6],xmm10
91	movdqa [rsp+16*7],xmm11
92	movdqa [rsp+16*8],xmm12
93	movdqa [rsp+16*9],xmm13
94	%endif
95
96	; check if smaller than 256
97	cmp arg3, 256
98
99	; for sizes less than 256, we can't fold 128B at a time...
100	jl _less_than_256
101
102
103	; load the initial crc value
104	movd xmm10, arg1_low32 ; initial crc
105
106	; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
107	; because data will be byte-reflected and will align with initial crc at correct place.
108	pslldq xmm10, 12
109
110	movdqa xmm11, [SHUF_MASK]
111	; receive the initial 128B data, xor the initial crc value
112	movdqu xmm0, [arg2+16*0]
113	movdqu xmm1, [arg2+16*1]
114	movdqu xmm2, [arg2+16*2]
115	movdqu xmm3, [arg2+16*3]
116	movdqu xmm4, [arg2+16*4]
117	movdqu xmm5, [arg2+16*5]
118	movdqu xmm6, [arg2+16*6]
119	movdqu xmm7, [arg2+16*7]
120
121	pshufb xmm0, xmm11
122	; XOR the initial_crc value
123	pxor xmm0, xmm10
124	pshufb xmm1, xmm11
125	pshufb xmm2, xmm11
126	pshufb xmm3, xmm11
127	pshufb xmm4, xmm11
128	pshufb xmm5, xmm11
129	pshufb xmm6, xmm11
130	pshufb xmm7, xmm11
131
132	movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
133	;imm value of pclmulqdq instruction will determine which constant to use
134	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
135	; we subtract 256 instead of 128 to save one instruction from the loop
136	sub arg3, 256
137
138	; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
139	; loop will fold 128B at a time until we have 128+y Bytes of buffer
140
141
142	; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
143	_fold_128_B_loop:
144
145	; update the buffer pointer
146	add arg2, 128 ; buf += 128;
147
148	movdqu xmm9, [arg2+16*0]
149	movdqu xmm12, [arg2+16*1]
150	pshufb xmm9, xmm11
151	pshufb xmm12, xmm11
152	movdqa xmm8, xmm0
153	movdqa xmm13, xmm1
154	pclmulqdq xmm0, xmm10, 0x0
155	pclmulqdq xmm8, xmm10 , 0x11
156	pclmulqdq xmm1, xmm10, 0x0
157	pclmulqdq xmm13, xmm10 , 0x11
158	pxor xmm0, xmm9
159	xorps xmm0, xmm8
160	pxor xmm1, xmm12
161	xorps xmm1, xmm13
162
163	movdqu xmm9, [arg2+16*2]
164	movdqu xmm12, [arg2+16*3]
165	pshufb xmm9, xmm11
166	pshufb xmm12, xmm11
167	movdqa xmm8, xmm2
168	movdqa xmm13, xmm3
169	pclmulqdq xmm2, xmm10, 0x0
170	pclmulqdq xmm8, xmm10 , 0x11
171	pclmulqdq xmm3, xmm10, 0x0
172	pclmulqdq xmm13, xmm10 , 0x11
173	pxor xmm2, xmm9
174	xorps xmm2, xmm8
175	pxor xmm3, xmm12
176	xorps xmm3, xmm13
177
178	movdqu xmm9, [arg2+16*4]
179	movdqu xmm12, [arg2+16*5]
180	pshufb xmm9, xmm11
181	pshufb xmm12, xmm11
182	movdqa xmm8, xmm4
183	movdqa xmm13, xmm5
184	pclmulqdq xmm4, xmm10, 0x0
185	pclmulqdq xmm8, xmm10 , 0x11
186	pclmulqdq xmm5, xmm10, 0x0
187	pclmulqdq xmm13, xmm10 , 0x11
188	pxor xmm4, xmm9
189	xorps xmm4, xmm8
190	pxor xmm5, xmm12
191	xorps xmm5, xmm13
192
193	movdqu xmm9, [arg2+16*6]
194	movdqu xmm12, [arg2+16*7]
195	pshufb xmm9, xmm11
196	pshufb xmm12, xmm11
197	movdqa xmm8, xmm6
198	movdqa xmm13, xmm7
199	pclmulqdq xmm6, xmm10, 0x0
200	pclmulqdq xmm8, xmm10 , 0x11
201	pclmulqdq xmm7, xmm10, 0x0
202	pclmulqdq xmm13, xmm10 , 0x11
203	pxor xmm6, xmm9
204	xorps xmm6, xmm8
205	pxor xmm7, xmm12
206	xorps xmm7, xmm13
207
208	sub arg3, 128
209
210	; check if there is another 128B in the buffer to be able to fold
211	jge _fold_128_B_loop
212	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
213
214
215	add arg2, 128
216	; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
217	; fold the 8 xmm registers to 1 xmm register with different constants
218
219	movdqa xmm10, [rk9]
220	movdqa xmm8, xmm0
221	pclmulqdq xmm0, xmm10, 0x11
222	pclmulqdq xmm8, xmm10, 0x0
223	pxor xmm7, xmm8
224	xorps xmm7, xmm0
225
226	movdqa xmm10, [rk11]
227	movdqa xmm8, xmm1
228	pclmulqdq xmm1, xmm10, 0x11
229	pclmulqdq xmm8, xmm10, 0x0
230	pxor xmm7, xmm8
231	xorps xmm7, xmm1
232
233	movdqa xmm10, [rk13]
234	movdqa xmm8, xmm2
235	pclmulqdq xmm2, xmm10, 0x11
236	pclmulqdq xmm8, xmm10, 0x0
237	pxor xmm7, xmm8
238	pxor xmm7, xmm2
239
240	movdqa xmm10, [rk15]
241	movdqa xmm8, xmm3
242	pclmulqdq xmm3, xmm10, 0x11
243	pclmulqdq xmm8, xmm10, 0x0
244	pxor xmm7, xmm8
245	xorps xmm7, xmm3
246
247	movdqa xmm10, [rk17]
248	movdqa xmm8, xmm4
249	pclmulqdq xmm4, xmm10, 0x11
250	pclmulqdq xmm8, xmm10, 0x0
251	pxor xmm7, xmm8
252	pxor xmm7, xmm4
253
254	movdqa xmm10, [rk19]
255	movdqa xmm8, xmm5
256	pclmulqdq xmm5, xmm10, 0x11
257	pclmulqdq xmm8, xmm10, 0x0
258	pxor xmm7, xmm8
259	xorps xmm7, xmm5
260
261	movdqa xmm10, [rk1] ;xmm10 has rk1 and rk2
262	;imm value of pclmulqdq instruction will determine which constant to use
263	movdqa xmm8, xmm6
264	pclmulqdq xmm6, xmm10, 0x11
265	pclmulqdq xmm8, xmm10, 0x0
266	pxor xmm7, xmm8
267	pxor xmm7, xmm6
268
269
270	; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
271	; instead of a cmp instruction, we use the negative flag with the jl instruction
272	add arg3, 128-16
273	jl _final_reduction_for_128
274
275	; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
276	; we can fold 16 bytes at a time if y>=16
277	; continue folding 16B at a time
278
279	_16B_reduction_loop:
280	movdqa xmm8, xmm7
281	pclmulqdq xmm7, xmm10, 0x11
282	pclmulqdq xmm8, xmm10, 0x0
283	pxor xmm7, xmm8
284	movdqu xmm0, [arg2]
285	pshufb xmm0, xmm11
286	pxor xmm7, xmm0
287	add arg2, 16
288	sub arg3, 16
289	; instead of a cmp instruction, we utilize the flags with the jge instruction
290	; equivalent of: cmp arg3, 16-16
291	; check if there is any more 16B in the buffer to be able to fold
292	jge _16B_reduction_loop
293
294	;now we have 16+z bytes left to reduce, where 0<= z < 16.
295	;first, we reduce the data in the xmm7 register
296
297
298	_final_reduction_for_128:
299	; check if any more data to fold. If not, compute the CRC of the final 128 bits
300	add arg3, 16
301	je _128_done
302
303	; here we are getting data that is less than 16 bytes.
304	; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
305	; after that the registers need to be adjusted.
306	_get_last_two_xmms:
307	movdqa xmm2, xmm7
308
309	movdqu xmm1, [arg2 - 16 + arg3]
310	pshufb xmm1, xmm11
311
312	; get rid of the extra data that was loaded before
313	; load the shift constant
314	lea rax, [pshufb_shf_table + 16]
315	sub rax, arg3
316	movdqu xmm0, [rax]
317
318	; shift xmm2 to the left by arg3 bytes
319	pshufb xmm2, xmm0
320
321	; shift xmm7 to the right by 16-arg3 bytes
322	pxor xmm0, [mask1]
323	pshufb xmm7, xmm0
324	pblendvb xmm1, xmm2 ;xmm0 is implicit
325
326	; fold 16 Bytes
327	movdqa xmm2, xmm1
328	movdqa xmm8, xmm7
329	pclmulqdq xmm7, xmm10, 0x11
330	pclmulqdq xmm8, xmm10, 0x0
331	pxor xmm7, xmm8
332	pxor xmm7, xmm2
333
334	_128_done:
335	; compute crc of a 128-bit value
336	movdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
337	movdqa xmm0, xmm7
338
339	;64b fold
340	pclmulqdq xmm7, xmm10, 0x1
341	pslldq xmm0, 8
342	pxor xmm7, xmm0
343
344	;32b fold
345	movdqa xmm0, xmm7
346
347	pand xmm0, [mask2]
348
349	psrldq xmm7, 12
350	pclmulqdq xmm7, xmm10, 0x10
351	pxor xmm7, xmm0
352
353	;barrett reduction
354	_barrett:
355	movdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
356	movdqa xmm0, xmm7
357	pclmulqdq xmm7, xmm10, 0x01
358	pslldq xmm7, 4
359	pclmulqdq xmm7, xmm10, 0x11
360
361	pslldq xmm7, 4
362	pxor xmm7, xmm0
363	pextrd eax, xmm7,1
364
365	_cleanup:
366	; scale the result back to 16 bits
367	shr eax, 16
368	%ifidn __OUTPUT_FORMAT__, win64
369	movdqa xmm6, [rsp+16*2]
370	movdqa xmm7, [rsp+16*3]
371	movdqa xmm8, [rsp+16*4]
372	movdqa xmm9, [rsp+16*5]
373	movdqa xmm10, [rsp+16*6]
374	movdqa xmm11, [rsp+16*7]
375	movdqa xmm12, [rsp+16*8]
376	movdqa xmm13, [rsp+16*9]
377	%endif
378	add rsp, VARIABLE_OFFSET
379	ret
380
381
382	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
383	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
384	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
385	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
386
387	align 16
388	_less_than_256:
389
390	; check if there is enough buffer to be able to fold 16B at a time
391	cmp arg3, 32
392	jl _less_than_32
393	movdqa xmm11, [SHUF_MASK]
394
395	; if there is, load the constants
396	movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
397
398	movd xmm0, arg1_low32 ; get the initial crc value
399	pslldq xmm0, 12 ; align it to its correct place
400	movdqu xmm7, [arg2] ; load the plaintext
401	pshufb xmm7, xmm11 ; byte-reflect the plaintext
402	pxor xmm7, xmm0
403
404
405	; update the buffer pointer
406	add arg2, 16
407
408	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
409	sub arg3, 32
410
411	jmp _16B_reduction_loop
412
413
414	align 16
415	_less_than_32:
416	; mov initial crc to the return value. this is necessary for zero-length buffers.
417	mov eax, arg1_low32
418	test arg3, arg3
419	je _cleanup
420
421	movdqa xmm11, [SHUF_MASK]
422
423	movd xmm0, arg1_low32 ; get the initial crc value
424	pslldq xmm0, 12 ; align it to its correct place
425
426	cmp arg3, 16
427	je _exact_16_left
428	jl _less_than_16_left
429
430	movdqu xmm7, [arg2] ; load the plaintext
431	pshufb xmm7, xmm11 ; byte-reflect the plaintext
432	pxor xmm7, xmm0 ; xor the initial crc value
433	add arg2, 16
434	sub arg3, 16
435	movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
436	jmp _get_last_two_xmms
437
438
439	align 16
440	_less_than_16_left:
441	; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
442
443	pxor xmm1, xmm1
444	mov r11, rsp
445	movdqa [r11], xmm1
446
447	cmp arg3, 4
448	jl _only_less_than_4
449
450	; backup the counter value
451	mov r9, arg3
452	cmp arg3, 8
453	jl _less_than_8_left
454
455	; load 8 Bytes
456	mov rax, [arg2]
457	mov [r11], rax
458	add r11, 8
459	sub arg3, 8
460	add arg2, 8
461	_less_than_8_left:
462
463	cmp arg3, 4
464	jl _less_than_4_left
465
466	; load 4 Bytes
467	mov eax, [arg2]
468	mov [r11], eax
469	add r11, 4
470	sub arg3, 4
471	add arg2, 4
472	_less_than_4_left:
473
474	cmp arg3, 2
475	jl _less_than_2_left
476
477	; load 2 Bytes
478	mov ax, [arg2]
479	mov [r11], ax
480	add r11, 2
481	sub arg3, 2
482	add arg2, 2
483	_less_than_2_left:
484	cmp arg3, 1
485	jl _zero_left
486
487	; load 1 Byte
488	mov al, [arg2]
489	mov [r11], al
490	_zero_left:
491	movdqa xmm7, [rsp]
492	pshufb xmm7, xmm11
493	pxor xmm7, xmm0 ; xor the initial crc value
494
495	lea rax, [pshufb_shf_table + 16]
496	sub rax, r9
497	movdqu xmm0, [rax]
498	pxor xmm0, [mask1]
499
500	pshufb xmm7, xmm0
501	jmp _128_done
502
503	align 16
504	_exact_16_left:
505	movdqu xmm7, [arg2]
506	pshufb xmm7, xmm11
507	pxor xmm7, xmm0 ; xor the initial crc value
508
509	jmp _128_done
510
511	_only_less_than_4:
512	cmp arg3, 3
513	jl _only_less_than_3
514
515	; load 3 Bytes
516	mov al, [arg2]
517	mov [r11], al
518
519	mov al, [arg2+1]
520	mov [r11+1], al
521
522	mov al, [arg2+2]
523	mov [r11+2], al
524
525	movdqa xmm7, [rsp]
526	pshufb xmm7, xmm11
527	pxor xmm7, xmm0 ; xor the initial crc value
528
529	psrldq xmm7, 5
530
531	jmp _barrett
532	_only_less_than_3:
533	cmp arg3, 2
534	jl _only_less_than_2
535
536	; load 2 Bytes
537	mov al, [arg2]
538	mov [r11], al
539
540	mov al, [arg2+1]
541	mov [r11+1], al
542
543	movdqa xmm7, [rsp]
544	pshufb xmm7, xmm11
545	pxor xmm7, xmm0 ; xor the initial crc value
546
547	psrldq xmm7, 6
548
549	jmp _barrett
550	_only_less_than_2:
551
552	; load 1 Byte
553	mov al, [arg2]
554	mov [r11], al
555
556	movdqa xmm7, [rsp]
557	pshufb xmm7, xmm11
558	pxor xmm7, xmm0 ; xor the initial crc value
559
560	psrldq xmm7, 7
561
562	jmp _barrett
563
564	section .data
565
566	; precomputed constants
567	; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
568	align 16
569	; Q = 0x18BB70000
570	; rk1 = 2^(32*3) mod Q << 32
571	; rk2 = 2^(32*5) mod Q << 32
572	; rk3 = 2^(32*15) mod Q << 32
573	; rk4 = 2^(32*17) mod Q << 32
574	; rk5 = 2^(32*3) mod Q << 32
575	; rk6 = 2^(32*2) mod Q << 32
576	; rk7 = floor(2^64/Q)
577	; rk8 = Q
578	rk1:
579	DQ 0x2d56000000000000
580	rk2:
581	DQ 0x06df000000000000
582	rk3:
583	DQ 0x9d9d000000000000
584	rk4:
585	DQ 0x7cf5000000000000
586	rk5:
587	DQ 0x2d56000000000000
588	rk6:
589	DQ 0x1368000000000000
590	rk7:
591	DQ 0x00000001f65a57f8
592	rk8:
593	DQ 0x000000018bb70000
594
595	rk9:
596	DQ 0xceae000000000000
597	rk10:
598	DQ 0xbfd6000000000000
599	rk11:
600	DQ 0x1e16000000000000
601	rk12:
602	DQ 0x713c000000000000
603	rk13:
604	DQ 0xf7f9000000000000
605	rk14:
606	DQ 0x80a6000000000000
607	rk15:
608	DQ 0x044c000000000000
609	rk16:
610	DQ 0xe658000000000000
611	rk17:
612	DQ 0xad18000000000000
613	rk18:
614	DQ 0xa497000000000000
615	rk19:
616	DQ 0x6ee3000000000000
617	rk20:
618	DQ 0xe7b5000000000000
619
620
621
622
623
624
625
626
627
628	mask1:
629	dq 0x8080808080808080, 0x8080808080808080
630	mask2:
631	dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
632
633	SHUF_MASK:
634	dq 0x08090A0B0C0D0E0F, 0x0001020304050607
635
636	pshufb_shf_table:
637	; use these values for shift constants for the pshufb instruction
638	; different alignments result in values as shown:
639	; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
640	; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2
641	; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3
642	; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
643	; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
644	; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
645	; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
646	; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
647	; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
648	; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
649	; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
650	; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
651	; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
652	; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
653	; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
654	dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
655	dq 0x0706050403020100, 0x000e0d0c0b0a0908
656
657	;;; func core, ver, snum
658	slversion crc16_t10dif_01, 01, 06, 0010
659