; [ceph.git] / ceph / src / isa-l / crc / crc16_t10dif_01.asm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;       Function API:
;       UINT16 crc16_t10dif_01(
;               UINT16 init_crc, //initial CRC value, 16 bits
;               const unsigned char *buf, //buffer pointer to calculate CRC on
;               UINT64 len //buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf

%include "reg_sizes.asm"

; Byte offset ahead of the current buffer pointer that is targeted by the
; prefetchnta hints inside _fold_128_B_loop.
%define	fetch_dist	1024

[bits 64]
default rel

section .text

; Argument registers. Per the Function API comment at the top of the file:
; arg1 = init_crc, arg2 = buf, arg3 = len.
; When assembling for win64 they are rcx/rdx/r8 (Microsoft x64 order); for any
; other output format they are rdi/rsi/rdx (System V AMD64 order).
; arg1_low32 is the 32-bit alias of arg1.
%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8

        %xdefine        arg1_low32 ecx
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx

        %xdefine        arg1_low32 edi
%endif

; Stack frame size (subtracted from rsp on entry, added back in _cleanup).
;   [rsp+16*0, rsp+16*2)   scratch; only the first 16 bytes are used in this
;                          file (staging area for inputs shorter than 16 bytes)
;   [rsp+16*2, rsp+16*10)  win64 only: save area for xmm6-xmm13
;   +8                     with rsp = 8 (mod 16) at function entry, this makes
;                          rsp 16-byte aligned after the sub, which the movdqa
;                          accesses to the stack require
; NOTE(review): XMM_SAVE is defined but never referenced in the code visible
; here; the save/restore code spells the offset out as 16*2.
%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_SAVE 16*2
        %define VARIABLE_OFFSET 16*10+8
%else
        %define VARIABLE_OFFSET 16*2+8
%endif

;-----------------------------------------------------------------------
; UINT16 crc16_t10dif_01(UINT16 init_crc, const unsigned char *buf, UINT64 len)
;
; In:    arg1_low32 = init_crc, arg2 = buf, arg3 = len
;        (win64: ecx/rdx/r8, otherwise: edi/rsi/rdx)
; Out:   eax = CRC in the low 16 bits (eax is shifted right by 16 in _cleanup)
; Clobb: rax, r9, r11, the three argument registers, xmm0-xmm13, flags
;        (on win64 xmm6-xmm13 are saved here and restored in _cleanup)
; Stack: VARIABLE_OFFSET bytes below the return address
;-----------------------------------------------------------------------
align 16
global	crc16_t10dif_01:function
crc16_t10dif_01:

	; Scale the 16-bit initial_crc to 32 bits by moving it into the upper half
	; of the 32-bit register. Any bits above bit 15 of the incoming value are
	; shifted out here.
	shl	arg1_low32, 16

	; After this point, code flow is exactly same as a 32-bit CRC.
	; The only difference is before returning eax, we will shift it right 16 bits, to scale back to 16 bits.

	; Reserve the frame; the odd +8 in VARIABLE_OFFSET leaves rsp 16-byte
	; aligned (given rsp = 8 mod 16 at entry) so movdqa to [rsp+...] is legal.
	sub	rsp, VARIABLE_OFFSET
%ifidn __OUTPUT_FORMAT__, win64
	; win64 treats xmm6-xmm15 as callee-saved; this routine uses xmm6-xmm13,
	; so store them in the frame (restored in _cleanup).
	movdqa [rsp+16*2],xmm6
	movdqa [rsp+16*3],xmm7
	movdqa [rsp+16*4],xmm8
	movdqa [rsp+16*5],xmm9
	movdqa [rsp+16*6],xmm10
	movdqa [rsp+16*7],xmm11
	movdqa [rsp+16*8],xmm12
	movdqa [rsp+16*9],xmm13
%endif

	; Dispatch on length: buffers shorter than 256 bytes take the
	; _less_than_256 path, everything else uses the 8-register 128-byte fold.
	cmp	arg3, 256

	; for sizes less than 256, we can't fold 128B at a time...
	; NOTE(review): jl is a signed comparison, so a len with bit 63 set would
	; also take this branch. Such lengths are presumably never passed - confirm.
	jl	_less_than_256


	; load the initial crc value (already shifted left by 16 at function entry)
	movd	xmm10, arg1_low32	; initial crc

	; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
	; because data will be byte-reflected and will align with initial crc at correct place.
	pslldq	xmm10, 12		; move the 32-bit value into bytes 12..15

	; xmm11 = byte-reversal shuffle mask; it stays in xmm11 for the rest of
	; the routine on this path.
	movdqa xmm11, [SHUF_MASK]
	; receive the initial 128B data, xor the initial crc value
	movdqu	xmm0, [arg2+16*0]
	movdqu	xmm1, [arg2+16*1]
	movdqu	xmm2, [arg2+16*2]
	movdqu	xmm3, [arg2+16*3]
	movdqu	xmm4, [arg2+16*4]
	movdqu	xmm5, [arg2+16*5]
	movdqu	xmm6, [arg2+16*6]
	movdqu	xmm7, [arg2+16*7]

	; Byte-reverse each 16-byte lane; only the first lane gets the seed.
	pshufb	xmm0, xmm11
	; XOR the initial_crc value
	pxor	xmm0, xmm10
	pshufb	xmm1, xmm11
	pshufb	xmm2, xmm11
	pshufb	xmm3, xmm11
	pshufb	xmm4, xmm11
	pshufb	xmm5, xmm11
	pshufb	xmm6, xmm11
	pshufb	xmm7, xmm11

	; xmm10 is reused from here on as the folding-constant register:
	; low qword = rk3, high qword = rk4.
	movdqa	xmm10, [rk3]	;xmm10 has rk3 and rk4
					;imm value of pclmulqdq instruction will determine which constant to use
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	; we subtract 256 instead of 128 to save one instruction from the loop
	; After this, arg3 = (bytes not yet loaded) - 128, so the loop can test the
	; sign of the counter directly instead of issuing a separate cmp.
	sub	arg3, 256

	; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
	; loop will fold 128B at a time until we have 128+y Bytes of buffer


	; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
	;
	; Register roles inside the loop:
	;   xmm0-xmm7   running 128-byte state (eight 16-byte lanes)
	;   xmm10       folding constants: low qword rk3, high qword rk4
	;   xmm11       byte-reversal shuffle mask
	;   xmm9/xmm12  the two freshly loaded data lanes of the current pair
	;   xmm8/xmm13  copies of the two state lanes of the current pair
	; Counter: at the top of each iteration arg3 = (bytes not yet loaded) - 128,
	; and arg2 still points at the previously loaded 128-byte chunk.
	;
	; For each lane the new state is
	;   (lane.low64 * rk3) xor (lane.high64 * rk4) xor new_data
	; where * is the carry-less multiply selected by the pclmulqdq immediate
	; (0x0 = low qwords of both operands, 0x11 = high qwords of both operands).
	; pxor and xorps both perform a plain 128-bit XOR here.
_fold_128_B_loop:

	; update the buffer pointer
	add	arg2, 128		;    buf += 128;

	; lanes 0 and 1
	; The prefetch address is fetch_dist bytes ahead and may lie beyond the end
	; of the buffer near the tail; prefetchnta is only a hint.
	prefetchnta [arg2+fetch_dist+0]
	movdqu	xmm9, [arg2+16*0]
	movdqu	xmm12, [arg2+16*1]
	pshufb	xmm9, xmm11
	pshufb	xmm12, xmm11
	movdqa	xmm8, xmm0
	movdqa	xmm13, xmm1
	pclmulqdq	xmm0, xmm10, 0x0
	pclmulqdq	xmm8, xmm10 , 0x11
	pclmulqdq	xmm1, xmm10, 0x0
	pclmulqdq	xmm13, xmm10 , 0x11
	pxor	xmm0, xmm9
	xorps	xmm0, xmm8
	pxor	xmm1, xmm12
	xorps	xmm1, xmm13

	; lanes 2 and 3
	prefetchnta [arg2+fetch_dist+32]
	movdqu	xmm9, [arg2+16*2]
	movdqu	xmm12, [arg2+16*3]
	pshufb	xmm9, xmm11
	pshufb	xmm12, xmm11
	movdqa	xmm8, xmm2
	movdqa	xmm13, xmm3
	pclmulqdq	xmm2, xmm10, 0x0
	pclmulqdq	xmm8, xmm10 , 0x11
	pclmulqdq	xmm3, xmm10, 0x0
	pclmulqdq	xmm13, xmm10 , 0x11
	pxor	xmm2, xmm9
	xorps	xmm2, xmm8
	pxor	xmm3, xmm12
	xorps	xmm3, xmm13

	; lanes 4 and 5
	prefetchnta [arg2+fetch_dist+64]
	movdqu	xmm9, [arg2+16*4]
	movdqu	xmm12, [arg2+16*5]
	pshufb	xmm9, xmm11
	pshufb	xmm12, xmm11
	movdqa	xmm8, xmm4
	movdqa	xmm13, xmm5
	pclmulqdq	xmm4, xmm10, 0x0
	pclmulqdq	xmm8, xmm10 , 0x11
	pclmulqdq	xmm5, xmm10, 0x0
	pclmulqdq	xmm13, xmm10 , 0x11
	pxor	xmm4, xmm9
	xorps	xmm4, xmm8
	pxor	xmm5, xmm12
	xorps	xmm5, xmm13

	; lanes 6 and 7
	prefetchnta [arg2+fetch_dist+96]
	movdqu	xmm9, [arg2+16*6]
	movdqu	xmm12, [arg2+16*7]
	pshufb	xmm9, xmm11
	pshufb	xmm12, xmm11
	movdqa	xmm8, xmm6
	movdqa	xmm13, xmm7
	pclmulqdq	xmm6, xmm10, 0x0
	pclmulqdq	xmm8, xmm10 , 0x11
	pclmulqdq	xmm7, xmm10, 0x0
	pclmulqdq	xmm13, xmm10 , 0x11
	pxor	xmm6, xmm9
	xorps	xmm6, xmm8
	pxor	xmm7, xmm12
	xorps	xmm7, xmm13

	; The flags from this sub drive the loop branch; nothing between the sub
	; and the jge may modify them.
	sub	arg3, 128

	; check if there is another 128B in the buffer to be able to fold
	; (signed test: loop while the biased counter is still >= 0)
	jge	_fold_128_B_loop
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


	; The loop exits with arg2 pointing at the last chunk it loaded; step past it.
	add	arg2, 128
	; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
	; fold the 8 xmm registers to 1 xmm register with different constants
	;
	; Each of xmm0..xmm6 is folded into xmm7 with its own constant pair:
	;   xmm0: rk9/rk10    xmm1: rk11/rk12   xmm2: rk13/rk14   xmm3: rk15/rk16
	;   xmm4: rk17/rk18   xmm5: rk19/rk20   xmm6: rk1/rk2
	; For every step: xmm7 ^= (lane.low64 * rk_low) xor (lane.high64 * rk_high),
	; with xmm8 holding the copy used for the low-qword product.
	; arg3 is still (bytes not yet loaded) - 128 here, i.e. in [-128, -1].

	movdqa	xmm10, [rk9]
	movdqa	xmm8, xmm0
	pclmulqdq	xmm0, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	xorps	xmm7, xmm0

	movdqa	xmm10, [rk11]
	movdqa	xmm8, xmm1
	pclmulqdq	xmm1, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	xorps	xmm7, xmm1

	movdqa	xmm10, [rk13]
	movdqa	xmm8, xmm2
	pclmulqdq	xmm2, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	pxor	xmm7, xmm2

	movdqa	xmm10, [rk15]
	movdqa	xmm8, xmm3
	pclmulqdq	xmm3, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	xorps	xmm7, xmm3

	movdqa	xmm10, [rk17]
	movdqa	xmm8, xmm4
	pclmulqdq	xmm4, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	pxor	xmm7, xmm4

	movdqa	xmm10, [rk19]
	movdqa	xmm8, xmm5
	pclmulqdq	xmm5, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	xorps	xmm7, xmm5

	; The last pair (rk1/rk2) is left in xmm10 on purpose: the 16-byte
	; reduction loop and the tail fold that follow use the same constants.
	movdqa	xmm10, [rk1]	;xmm10 has rk1 and rk2
				;imm value of pclmulqdq instruction will determine which constant to use
	movdqa	xmm8, xmm6
	pclmulqdq	xmm6, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	pxor	xmm7, xmm6


	; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
	; instead of a cmp instruction, we use the negative flag with the jl instruction
	; After the add, arg3 = (bytes not yet loaded) - 16; negative means fewer
	; than 16 bytes remain, so the 16-byte loop is skipped.
	add	arg3, 128-16
	jl	_final_reduction_for_128

	; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
	; we can fold 16 bytes at a time if y>=16
	; continue folding 16B at a time
	;
	; Entered by fall-through from the 8-to-1 reduction and by a jmp from
	; _less_than_256. Expected on entry:
	;   xmm7  = 16-byte running state
	;   xmm10 = rk1 (low qword) / rk2 (high qword)
	;   xmm11 = byte-reversal shuffle mask
	;   arg2  = next unread byte
	;   arg3  = (bytes not yet loaded) - 16, and >= 0

_16B_reduction_loop:
	movdqa	xmm8, xmm7
	pclmulqdq	xmm7, xmm10, 0x11	; state.high64 * rk2
	pclmulqdq	xmm8, xmm10, 0x0	; state.low64  * rk1
	pxor	xmm7, xmm8
	movdqu	xmm0, [arg2]		; next 16 input bytes
	pshufb	xmm0, xmm11		; byte-reverse them
	pxor	xmm7, xmm0
	add	arg2, 16
	sub	arg3, 16
	; instead of a cmp instruction, we utilize the flags with the jge instruction
	; equivalent of: cmp arg3, 16-16
	; check if there is any more 16B in the buffer to be able to fold
	jge	_16B_reduction_loop

	;now we have 16+z bytes left to reduce, where 0<= z < 16.
	;first, we reduce the data in the xmm7 register


_final_reduction_for_128:
	; check if any more data to fold. If not, compute the CRC of the final 128 bits
	; arg3 arrives here as (remaining bytes) - 16; undo the bias so that
	; arg3 = remaining bytes (0..15). Zero means xmm7 already covers everything.
	add	arg3, 16
	je	_128_done

	; here we are getting data that is less than 16 bytes.
	; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
	; after that the registers need to be adjusted.
	;
	; Also entered by a jmp from _less_than_32 (16 < len < 32). Expected on entry:
	;   xmm7 = 16-byte running state, xmm10 = rk1/rk2, xmm11 = shuffle mask,
	;   arg2 = first unread byte, arg3 = number of unread bytes (1..15).
_get_last_two_xmms:
	movdqa	xmm2, xmm7

	; Load the final 16 bytes of the buffer; the first 16-arg3 of them were
	; already consumed and get replaced by the blend below.
	movdqu	xmm1, [arg2 - 16 + arg3]
	pshufb	xmm1, xmm11

	; get rid of the extra data that was loaded before
	; load the shift constant
	; (16 selector bytes starting at pshufb_shf_table + 16 - arg3)
	lea	rax, [pshufb_shf_table + 16]
	sub	rax, arg3
	movdqu	xmm0, [rax]

	; shift xmm2 to the left by arg3 bytes
	; (bytes move toward the high end; the low arg3 bytes become zero)
	pshufb	xmm2, xmm0

	; shift xmm7 to the right by 16-arg3 bytes
	; XOR with 0x80 in every byte toggles the top bit of each selector, which
	; turns the left-shift mask into the complementary right-shift mask.
	pxor	xmm0, [mask1]
	pshufb	xmm7, xmm0
	; Bytes whose selector in xmm0 has the top bit set (positions >= arg3) are
	; taken from xmm2; the low arg3 bytes stay from xmm1 (the new tail data).
	pblendvb	xmm1, xmm2	;xmm0 is implicit

	; fold 16 Bytes
	movdqa	xmm2, xmm1
	movdqa	xmm8, xmm7
	pclmulqdq	xmm7, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	pxor	xmm7, xmm2

; Final reduction of the 128-bit state in xmm7 down to a 32-bit value.
; Reached by fall-through and by jumps from the short-buffer paths.
_128_done:
	; compute crc of a 128-bit value
	movdqa	xmm10, [rk5]	; rk5 and rk6 in xmm10
	movdqa	xmm0, xmm7

	;64b fold
	; imm 0x1: high qword of xmm7 times low qword of xmm10 (rk5)
	pclmulqdq	xmm7, xmm10, 0x1
	pslldq	xmm0, 8		; original low qword moved into the high qword
	pxor	xmm7, xmm0

	;32b fold
	movdqa	xmm0, xmm7

	pand	xmm0, [mask2]	; keep the low 12 bytes

	psrldq	xmm7, 12	; top dword moved down to the bottom
	; imm 0x10: low qword of xmm7 times high qword of xmm10 (rk6)
	pclmulqdq	xmm7, xmm10, 0x10
	pxor	xmm7, xmm0

	;barrett reduction
	; Also the direct entry point for inputs of 1..3 bytes, which skip the two
	; folds above.
_barrett:
	movdqa	xmm10, [rk7]	; rk7 and rk8 in xmm10
	movdqa	xmm0, xmm7
	; imm 0x01: high qword of xmm7 times low qword of xmm10 (rk7)
	pclmulqdq	xmm7, xmm10, 0x01
	pslldq	xmm7, 4
	; imm 0x11: high qword of xmm7 times high qword of xmm10 (rk8)
	pclmulqdq	xmm7, xmm10, 0x11

	pslldq	xmm7, 4
	pxor	xmm7, xmm0
	; The 32-bit result is dword 1 of xmm7; _cleanup scales it back to 16 bits.
	pextrd	eax, xmm7,1

; Common exit. Expects the 32-bit scaled CRC in eax; also reached directly
; from _less_than_32 for zero-length input, where eax holds the scaled seed.
_cleanup:
	; scale the result back to 16 bits
	shr	eax, 16
%ifidn __OUTPUT_FORMAT__, win64
	; restore the callee-saved xmm registers stored in the prologue
	movdqa	xmm6, [rsp+16*2]
	movdqa	xmm7, [rsp+16*3]
	movdqa	xmm8, [rsp+16*4]
	movdqa	xmm9, [rsp+16*5]
	movdqa	xmm10, [rsp+16*6]
	movdqa	xmm11, [rsp+16*7]
	movdqa	xmm12, [rsp+16*8]
	movdqa	xmm13, [rsp+16*9]
%endif
	; release the frame reserved at function entry
	add	rsp, VARIABLE_OFFSET
	ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Path for len < 256 (reached from the length check at function entry).
; Inputs of 32..255 bytes are handled purely by the 16-byte reduction loop.
align 16
_less_than_256:

	; check if there is enough buffer to be able to fold 16B at a time
	cmp	arg3, 32
	jl	_less_than_32
	movdqa xmm11, [SHUF_MASK]	; byte-reversal shuffle mask

	; if there is, load the constants
	movdqa	xmm10, [rk1]	; rk1 and rk2 in xmm10

	movd	xmm0, arg1_low32	; get the initial crc value
	pslldq	xmm0, 12	; align it to its correct place
	movdqu	xmm7, [arg2]	; load the plaintext
	pshufb	xmm7, xmm11	; byte-reflect the plaintext
	pxor	xmm7, xmm0	; xor the initial crc value into the first 16 bytes


	; update the buffer pointer
	add	arg2, 16

	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
	; (arg3 becomes (bytes not yet loaded) - 16, the bias the loop expects)
	sub	arg3, 32

	jmp	_16B_reduction_loop


; Path for len < 32: dispatches on len == 0, len < 16, len == 16, and
; 16 < len < 32 (the last case is handled inline below).
align 16
_less_than_32:
	; mov initial crc to the return value. this is necessary for zero-length buffers.
	; (eax holds the seed scaled by << 16; _cleanup shifts it back down)
	mov	eax, arg1_low32
	test	arg3, arg3
	je	_cleanup

	movdqa xmm11, [SHUF_MASK]	; byte-reversal shuffle mask

	movd	xmm0, arg1_low32	; get the initial crc value
	pslldq	xmm0, 12	; align it to its correct place

	; Both branches below use the flags of this single cmp.
	cmp	arg3, 16
	je	_exact_16_left
	jl	_less_than_16_left

	; 16 < len < 32: take the first 16 bytes as the state and let
	; _get_last_two_xmms fold in the remaining 1..15 bytes.
	movdqu	xmm7, [arg2]	; load the plaintext
	pshufb	xmm7, xmm11	; byte-reflect the plaintext
	pxor	xmm7, xmm0	; xor the initial crc value
	add	arg2, 16
	sub	arg3, 16	; arg3 = number of tail bytes (1..15)
	movdqa	xmm10, [rk1]	; rk1 and rk2 in xmm10
	jmp	_get_last_two_xmms


; Path for 1 <= len < 16. The input is copied into a zeroed 16-byte stack
; slot so that a full 16-byte load never reads beyond the caller's buffer.
; Expects xmm0 = positioned seed and xmm11 = shuffle mask (set in _less_than_32).
align 16
_less_than_16_left:
	; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.

	pxor	xmm1, xmm1
	mov	r11, rsp	; r11 = write cursor into the staging slot
	movdqa	[r11], xmm1

	; 1..3 bytes are handled separately and go straight to _barrett
	cmp	arg3, 4
	jl	_only_less_than_4

	;	backup the counter value
	;	(arg3 is consumed by the chunked copy; r9 keeps the original length
	;	for the shift-mask lookup in _zero_left)
	mov	r9, arg3
	cmp	arg3, 8
	jl	_less_than_8_left

	; load 8 Bytes
	mov	rax, [arg2]
	mov	[r11], rax
	add	r11, 8
	sub	arg3, 8
	add	arg2, 8
_less_than_8_left:

	cmp	arg3, 4
	jl	_less_than_4_left

	; load 4 Bytes
	mov	eax, [arg2]
	mov	[r11], eax
	add	r11, 4
	sub	arg3, 4
	add	arg2, 4
_less_than_4_left:

	cmp	arg3, 2
	jl	_less_than_2_left

	; load 2 Bytes
	mov	ax, [arg2]
	mov	[r11], ax
	add	r11, 2
	sub	arg3, 2
	add	arg2, 2
_less_than_2_left:
	cmp     arg3, 1
        jl      _zero_left

	; load 1 Byte
	mov	al, [arg2]
	mov	[r11], al
_zero_left:
	; The staging slot now holds the input followed by zero padding.
	movdqa	xmm7, [rsp]
	pshufb	xmm7, xmm11
	pxor	xmm7, xmm0	; xor the initial crc value

	; Build the byte-shift mask for the original length in r9: the table entry
	; at pshufb_shf_table + 16 - r9, XORed with mask1, shifts the register
	; right by 16 - r9 bytes and zero-fills the top.
	lea	rax, [pshufb_shf_table + 16]
	sub	rax, r9
	movdqu	xmm0, [rax]
	pxor	xmm0, [mask1]

	pshufb	xmm7, xmm0
	jmp	_128_done

; Path for len == 16: the whole input is a single 16-byte state, so no
; folding with rk1/rk2 is needed before the final reduction.
; Expects xmm0 = positioned seed and xmm11 = shuffle mask (set in _less_than_32).
align 16
_exact_16_left:
	movdqu	xmm7, [arg2]	; load all 16 input bytes
	pshufb	xmm7, xmm11	; byte-reflect them
	pxor	xmm7, xmm0	; xor the initial crc value

	jmp	_128_done

; Paths for len = 3, 2 and 1. Entered from _less_than_16_left with r11 = rsp
; pointing at the zeroed 16-byte staging slot, xmm0 = positioned seed and
; xmm11 = shuffle mask. Bytes are copied one at a time, the register is
; shifted right by 5, 6 or 7 bytes (i.e. 8 - len), and control goes straight
; to _barrett, skipping the 64-bit and 32-bit folds in _128_done.
_only_less_than_4:
	cmp	arg3, 3
	jl	_only_less_than_3

	; load 3 Bytes
	mov	al, [arg2]
	mov	[r11], al

	mov	al, [arg2+1]
	mov	[r11+1], al

	mov	al, [arg2+2]
	mov	[r11+2], al

	movdqa	xmm7, [rsp]
	pshufb	xmm7, xmm11
	pxor	xmm7, xmm0	; xor the initial crc value

	psrldq	xmm7, 5		; 8 - 3

	jmp	_barrett
_only_less_than_3:
	cmp	arg3, 2
	jl	_only_less_than_2

	; load 2 Bytes
	mov	al, [arg2]
	mov	[r11], al

	mov	al, [arg2+1]
	mov	[r11+1], al

	movdqa	xmm7, [rsp]
	pshufb	xmm7, xmm11
	pxor	xmm7, xmm0	; xor the initial crc value

	psrldq	xmm7, 6		; 8 - 2

	jmp	_barrett
_only_less_than_2:

	; load 1 Byte
	mov	al, [arg2]
	mov	[r11], al

	movdqa	xmm7, [rsp]
	pshufb	xmm7, xmm11
	pxor	xmm7, xmm0	; xor the initial crc value

	psrldq	xmm7, 7		; 8 - 1

	jmp	_barrett

section .data

; precomputed constants
; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
;
; Layout notes:
;  * The code loads the constants in 16-byte pairs with movdqa [rkN], so each
;    odd-numbered rkN must sit on a 16-byte boundary with rk(N+1) right after
;    it: rkN lands in the low qword, rk(N+1) in the high qword.
;  * rk1..rk20 occupy 20*8 = 160 bytes, which keeps mask1, mask2 and SHUF_MASK
;    16-byte aligned as well. They are used as aligned memory operands
;    (movdqa / pxor / pand), so do not insert data that breaks this alignment.
align 16
; Q = 0x18BB70000
; rk1 = 2^(32*3) mod Q << 32
; rk2 = 2^(32*5) mod Q << 32
; rk3 = 2^(32*15) mod Q << 32
; rk4 = 2^(32*17) mod Q << 32
; rk5 = 2^(32*3) mod Q << 32
; rk6 = 2^(32*2) mod Q << 32
; rk7 = floor(2^64/Q)
; rk8 = Q
;
; Usage in the code:
;   rk1/rk2   16-byte fold (_16B_reduction_loop, tail fold, xmm6 -> xmm7)
;   rk3/rk4   128-byte fold (_fold_128_B_loop)
;   rk5/rk6   128-bit -> 64-bit -> 32-bit folds in _128_done
;   rk7/rk8   Barrett reduction in _barrett
rk1:
DQ 0x2d56000000000000
rk2:
DQ 0x06df000000000000
rk3:
DQ 0x9d9d000000000000
rk4:
DQ 0x7cf5000000000000
rk5:
DQ 0x2d56000000000000
rk6:
DQ 0x1368000000000000
rk7:
DQ 0x00000001f65a57f8
rk8:
DQ 0x000000018bb70000

; rk9..rk20: constant pairs used to fold xmm0..xmm5 into xmm7 after the
; 128-byte loop (rk9/rk10 for xmm0, rk11/rk12 for xmm1, ... rk19/rk20 for xmm5).
; NOTE(review): their derivation is not documented in this file; presumably
; they follow the same "2^k mod Q << 32" scheme with larger exponents - confirm.
rk9:
DQ 0xceae000000000000
rk10:
DQ 0xbfd6000000000000
rk11:
DQ 0x1e16000000000000
rk12:
DQ 0x713c000000000000
rk13:
DQ 0xf7f9000000000000
rk14:
DQ 0x80a6000000000000
rk15:
DQ 0x044c000000000000
rk16:
DQ 0xe658000000000000
rk17:
DQ 0xad18000000000000
rk18:
DQ 0xa497000000000000
rk19:
DQ 0x6ee3000000000000
rk20:
DQ 0xe7b5000000000000


; mask1: 0x80 in every byte. XORing a pshufb selector with it toggles the
; "zero this byte" bit of every lane.
mask1:
dq 0x8080808080808080, 0x8080808080808080
; mask2: keeps the low 12 bytes of a register and clears the top 4.
mask2:
dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

; SHUF_MASK: pshufb selector that reverses the order of all 16 bytes.
SHUF_MASK:
dq 0x08090A0B0C0D0E0F, 0x0001020304050607

; pshufb_shf_table: 32 bytes read with an unaligned 16-byte load at
; pshufb_shf_table + 16 - n (n = 1..15). Used as is, the selector shifts a
; register left by n bytes; XORed with mask1 it shifts right by 16 - n bytes.
pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;	dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;	dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;	dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;	dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;	dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;	dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;	dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
;	dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
;	dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
;	dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;	dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;	dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;	dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;	dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;	dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908

;;;       func          core, ver, snum
; NOTE(review): slversion is not defined in this file; presumably a macro from
; reg_sizes.asm that emits a version record for the function - confirm.
slversion crc16_t10dif_01, 01,   06,  0010
Commit	Line	Data
7c673cae FG	1	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	2	; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
	3	;
	4	; Redistribution and use in source and binary forms, with or without
	5	; modification, are permitted provided that the following conditions
	6	; are met:
	7	; * Redistributions of source code must retain the above copyright
	8	; notice, this list of conditions and the following disclaimer.
	9	; * Redistributions in binary form must reproduce the above copyright
	10	; notice, this list of conditions and the following disclaimer in
	11	; the documentation and/or other materials provided with the
	12	; distribution.
	13	; * Neither the name of Intel Corporation nor the names of its
	14	; contributors may be used to endorse or promote products derived
	15	; from this software without specific prior written permission.
	16	;
	17	; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	18	; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	19	; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	20	; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	21	; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	22	; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	23	; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	24	; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	25	; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	26	; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	27	; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	28	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	29
	30	; Function API:
	31	; UINT16 crc16_t10dif_01(
	32	; UINT16 init_crc, //initial CRC value, 16 bits
	33	; const unsigned char *buf, //buffer pointer to calculate CRC on
	34	; UINT64 len //buffer length in bytes (64-bit data)
	35	; );
	36	;
	37	; Authors:
	38	; Erdinc Ozturk
	39	; Vinodh Gopal
	40	; James Guilford
	41	;
	42	; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
	43	; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
	44
	45	%include "reg_sizes.asm"
	46
224ce89b WB	47	%define fetch_dist 1024
224ce89b WB	48
7c673cae FG	49	[bits 64]
	50	default rel
	51
	52	section .text
	53
	54	%ifidn __OUTPUT_FORMAT__, win64
	55	%xdefine arg1 rcx
	56	%xdefine arg2 rdx
	57	%xdefine arg3 r8
	58
	59	%xdefine arg1_low32 ecx
	60	%else
	61	%xdefine arg1 rdi
	62	%xdefine arg2 rsi
	63	%xdefine arg3 rdx
	64
	65	%xdefine arg1_low32 edi
	66	%endif
	67
	68	%ifidn __OUTPUT_FORMAT__, win64
	69	%define XMM_SAVE 16*2
	70	%define VARIABLE_OFFSET 16*10+8
	71	%else
	72	%define VARIABLE_OFFSET 16*2+8
	73	%endif
	74
	75	align 16
	76	global crc16_t10dif_01:function
	77	crc16_t10dif_01:
	78
	79	; adjust the 16-bit initial_crc value, scale it to 32 bits
	80	shl arg1_low32, 16
	81
	82	; After this point, code flow is exactly same as a 32-bit CRC.
	83	; The only difference is before returning eax, we will shift it right 16 bits, to scale back to 16 bits.
	84
	85	sub rsp, VARIABLE_OFFSET
	86	%ifidn __OUTPUT_FORMAT__, win64
	87	; push the xmm registers into the stack to maintain
	88	movdqa [rsp+16*2],xmm6
	89	movdqa [rsp+16*3],xmm7
	90	movdqa [rsp+16*4],xmm8
	91	movdqa [rsp+16*5],xmm9
	92	movdqa [rsp+16*6],xmm10
	93	movdqa [rsp+16*7],xmm11
	94	movdqa [rsp+16*8],xmm12
	95	movdqa [rsp+16*9],xmm13
	96	%endif
	97
	98	; check if smaller than 256
	99	cmp arg3, 256
	100
	101	; for sizes less than 256, we can't fold 128B at a time...
	102	jl _less_than_256
	103
	104
	105	; load the initial crc value
	106	movd xmm10, arg1_low32 ; initial crc
	107
	108	; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
	109	; because data will be byte-reflected and will align with initial crc at correct place.
	110	pslldq xmm10, 12
	111
	112	movdqa xmm11, [SHUF_MASK]
113	; receive the initial 128B data, xor the initial crc value
114	movdqu xmm0, [arg2+16*0]
115	movdqu xmm1, [arg2+16*1]
116	movdqu xmm2, [arg2+16*2]
117	movdqu xmm3, [arg2+16*3]
118	movdqu xmm4, [arg2+16*4]
119	movdqu xmm5, [arg2+16*5]
120	movdqu xmm6, [arg2+16*6]
121	movdqu xmm7, [arg2+16*7]
122
123	pshufb xmm0, xmm11
124	; XOR the initial_crc value
125	pxor xmm0, xmm10
126	pshufb xmm1, xmm11
127	pshufb xmm2, xmm11
128	pshufb xmm3, xmm11
129	pshufb xmm4, xmm11
130	pshufb xmm5, xmm11
131	pshufb xmm6, xmm11
132	pshufb xmm7, xmm11
133
134	movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
135	;imm value of pclmulqdq instruction will determine which constant to use
136	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
137	; we subtract 256 instead of 128 to save one instruction from the loop
138	sub arg3, 256
139
140	; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
141	; loop will fold 128B at a time until we have 128+y Bytes of buffer
142
143
144	; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
145	_fold_128_B_loop:
146
147	; update the buffer pointer
148	add arg2, 128 ; buf += 128;
149
224ce89b	150	prefetchnta [arg2+fetch_dist+0]
7c673cae FG	151	movdqu xmm9, [arg2+16*0]
	152	movdqu xmm12, [arg2+16*1]
	153	pshufb xmm9, xmm11
	154	pshufb xmm12, xmm11
	155	movdqa xmm8, xmm0
	156	movdqa xmm13, xmm1
	157	pclmulqdq xmm0, xmm10, 0x0
	158	pclmulqdq xmm8, xmm10 , 0x11
	159	pclmulqdq xmm1, xmm10, 0x0
	160	pclmulqdq xmm13, xmm10 , 0x11
	161	pxor xmm0, xmm9
	162	xorps xmm0, xmm8
	163	pxor xmm1, xmm12
	164	xorps xmm1, xmm13
	165
224ce89b	166	prefetchnta [arg2+fetch_dist+32]
7c673cae FG	167	movdqu xmm9, [arg2+16*2]
	168	movdqu xmm12, [arg2+16*3]
	169	pshufb xmm9, xmm11
	170	pshufb xmm12, xmm11
	171	movdqa xmm8, xmm2
	172	movdqa xmm13, xmm3
	173	pclmulqdq xmm2, xmm10, 0x0
	174	pclmulqdq xmm8, xmm10 , 0x11
	175	pclmulqdq xmm3, xmm10, 0x0
	176	pclmulqdq xmm13, xmm10 , 0x11
	177	pxor xmm2, xmm9
	178	xorps xmm2, xmm8
	179	pxor xmm3, xmm12
	180	xorps xmm3, xmm13
	181
224ce89b	182	prefetchnta [arg2+fetch_dist+64]
7c673cae FG	183	movdqu xmm9, [arg2+16*4]
	184	movdqu xmm12, [arg2+16*5]
	185	pshufb xmm9, xmm11
	186	pshufb xmm12, xmm11
	187	movdqa xmm8, xmm4
	188	movdqa xmm13, xmm5
	189	pclmulqdq xmm4, xmm10, 0x0
	190	pclmulqdq xmm8, xmm10 , 0x11
	191	pclmulqdq xmm5, xmm10, 0x0
	192	pclmulqdq xmm13, xmm10 , 0x11
	193	pxor xmm4, xmm9
	194	xorps xmm4, xmm8
	195	pxor xmm5, xmm12
	196	xorps xmm5, xmm13
	197
224ce89b	198	prefetchnta [arg2+fetch_dist+96]
7c673cae FG	199	movdqu xmm9, [arg2+16*6]
	200	movdqu xmm12, [arg2+16*7]
	201	pshufb xmm9, xmm11
	202	pshufb xmm12, xmm11
	203	movdqa xmm8, xmm6
	204	movdqa xmm13, xmm7
	205	pclmulqdq xmm6, xmm10, 0x0
	206	pclmulqdq xmm8, xmm10 , 0x11
	207	pclmulqdq xmm7, xmm10, 0x0
	208	pclmulqdq xmm13, xmm10 , 0x11
	209	pxor xmm6, xmm9
	210	xorps xmm6, xmm8
	211	pxor xmm7, xmm12
	212	xorps xmm7, xmm13
	213
	214	sub arg3, 128
	215
	216	; check if there is another 128B in the buffer to be able to fold
	217	jge _fold_128_B_loop
	218	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	219
	220
	221	add arg2, 128
	222	; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
	223	; fold the 8 xmm registers to 1 xmm register with different constants
	224
	225	movdqa xmm10, [rk9]
	226	movdqa xmm8, xmm0
	227	pclmulqdq xmm0, xmm10, 0x11
	228	pclmulqdq xmm8, xmm10, 0x0
	229	pxor xmm7, xmm8
	230	xorps xmm7, xmm0
	231
	232	movdqa xmm10, [rk11]
	233	movdqa xmm8, xmm1
	234	pclmulqdq xmm1, xmm10, 0x11
	235	pclmulqdq xmm8, xmm10, 0x0
	236	pxor xmm7, xmm8
	237	xorps xmm7, xmm1
	238
	239	movdqa xmm10, [rk13]
	240	movdqa xmm8, xmm2
	241	pclmulqdq xmm2, xmm10, 0x11
	242	pclmulqdq xmm8, xmm10, 0x0
	243	pxor xmm7, xmm8
	244	pxor xmm7, xmm2
	245
	246	movdqa xmm10, [rk15]
	247	movdqa xmm8, xmm3
	248	pclmulqdq xmm3, xmm10, 0x11
	249	pclmulqdq xmm8, xmm10, 0x0
	250	pxor xmm7, xmm8
	251	xorps xmm7, xmm3
	252
	253	movdqa xmm10, [rk17]
	254	movdqa xmm8, xmm4
	255	pclmulqdq xmm4, xmm10, 0x11
	256	pclmulqdq xmm8, xmm10, 0x0
	257	pxor xmm7, xmm8
	258	pxor xmm7, xmm4
	259
	260	movdqa xmm10, [rk19]
	261	movdqa xmm8, xmm5
	262	pclmulqdq xmm5, xmm10, 0x11
263	pclmulqdq xmm8, xmm10, 0x0
264	pxor xmm7, xmm8
265	xorps xmm7, xmm5
266
267	movdqa xmm10, [rk1] ;xmm10 has rk1 and rk2
268	;imm value of pclmulqdq instruction will determine which constant to use
269	movdqa xmm8, xmm6
270	pclmulqdq xmm6, xmm10, 0x11
271	pclmulqdq xmm8, xmm10, 0x0
272	pxor xmm7, xmm8
273	pxor xmm7, xmm6
274
275
276	; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
277	; instead of a cmp instruction, we use the negative flag with the jl instruction
278	add arg3, 128-16
279	jl _final_reduction_for_128
280
281	; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
282	; we can fold 16 bytes at a time if y>=16
283	; continue folding 16B at a time
284
285	_16B_reduction_loop:
286	movdqa xmm8, xmm7
287	pclmulqdq xmm7, xmm10, 0x11
288	pclmulqdq xmm8, xmm10, 0x0
289	pxor xmm7, xmm8
290	movdqu xmm0, [arg2]
291	pshufb xmm0, xmm11
292	pxor xmm7, xmm0
293	add arg2, 16
294	sub arg3, 16
295	; instead of a cmp instruction, we utilize the flags with the jge instruction
296	; equivalent of: cmp arg3, 16-16
297	; check if there is any more 16B in the buffer to be able to fold
298	jge _16B_reduction_loop
299
300	;now we have 16+z bytes left to reduce, where 0<= z < 16.
301	;first, we reduce the data in the xmm7 register
302
303
304	_final_reduction_for_128:
305	; check if any more data to fold. If not, compute the CRC of the final 128 bits
306	add arg3, 16
307	je _128_done
308
309	; here we are getting data that is less than 16 bytes.
310	; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
311	; after that the registers need to be adjusted.
312	_get_last_two_xmms:
313	movdqa xmm2, xmm7
314
315	movdqu xmm1, [arg2 - 16 + arg3]
316	pshufb xmm1, xmm11
317
318	; get rid of the extra data that was loaded before
319	; load the shift constant
320	lea rax, [pshufb_shf_table + 16]
321	sub rax, arg3
322	movdqu xmm0, [rax]
323
324	; shift xmm2 to the left by arg3 bytes
325	pshufb xmm2, xmm0
326
327	; shift xmm7 to the right by 16-arg3 bytes
328	pxor xmm0, [mask1]
329	pshufb xmm7, xmm0
330	pblendvb xmm1, xmm2 ;xmm0 is implicit
331
332	; fold 16 Bytes
333	movdqa xmm2, xmm1
334	movdqa xmm8, xmm7
335	pclmulqdq xmm7, xmm10, 0x11
336	pclmulqdq xmm8, xmm10, 0x0
337	pxor xmm7, xmm8
338	pxor xmm7, xmm2
339
340	_128_done:
341	; compute crc of a 128-bit value
342	movdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
343	movdqa xmm0, xmm7
344
345	;64b fold
346	pclmulqdq xmm7, xmm10, 0x1
347	pslldq xmm0, 8
348	pxor xmm7, xmm0
349
350	;32b fold
351	movdqa xmm0, xmm7
352
353	pand xmm0, [mask2]
354
355	psrldq xmm7, 12
356	pclmulqdq xmm7, xmm10, 0x10
357	pxor xmm7, xmm0
358
359	;barrett reduction
360	_barrett:
361	movdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
362	movdqa xmm0, xmm7
363	pclmulqdq xmm7, xmm10, 0x01
364	pslldq xmm7, 4
365	pclmulqdq xmm7, xmm10, 0x11
366
367	pslldq xmm7, 4
368	pxor xmm7, xmm0
369	pextrd eax, xmm7,1
370
371	_cleanup:
372	; scale the result back to 16 bits
373	shr eax, 16
374	%ifidn __OUTPUT_FORMAT__, win64
375	movdqa xmm6, [rsp+16*2]
376	movdqa xmm7, [rsp+16*3]
377	movdqa xmm8, [rsp+16*4]
378	movdqa xmm9, [rsp+16*5]
379	movdqa xmm10, [rsp+16*6]
380	movdqa xmm11, [rsp+16*7]
381	movdqa xmm12, [rsp+16*8]
382	movdqa xmm13, [rsp+16*9]
383	%endif
384	add rsp, VARIABLE_OFFSET
385	ret
386
387
388	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
389	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
390	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
391	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
392
393	align 16
394	_less_than_256:
395
396	; check if there is enough buffer to be able to fold 16B at a time
397	cmp arg3, 32
398	jl _less_than_32
399	movdqa xmm11, [SHUF_MASK]
400
401	; if there is, load the constants
402	movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
403
404	movd xmm0, arg1_low32 ; get the initial crc value
405	pslldq xmm0, 12 ; align it to its correct place
406	movdqu xmm7, [arg2] ; load the plaintext
407	pshufb xmm7, xmm11 ; byte-reflect the plaintext
408	pxor xmm7, xmm0
409
410
411	; update the buffer pointer
412	add arg2, 16
413
414	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
415	sub arg3, 32
416
417	jmp _16B_reduction_loop
418
419
420	align 16
421	_less_than_32:
422	; mov initial crc to the return value. this is necessary for zero-length buffers.
423	mov eax, arg1_low32
424	test arg3, arg3
425	je _cleanup
426
427	movdqa xmm11, [SHUF_MASK]
428
429	movd xmm0, arg1_low32 ; get the initial crc value
430	pslldq xmm0, 12 ; align it to its correct place
431
432	cmp arg3, 16
433	je _exact_16_left
434	jl _less_than_16_left
435
436	movdqu xmm7, [arg2] ; load the plaintext
437	pshufb xmm7, xmm11 ; byte-reflect the plaintext
438	pxor xmm7, xmm0 ; xor the initial crc value
439	add arg2, 16
440	sub arg3, 16
441	movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
442	jmp _get_last_two_xmms
443
444
445	align 16
446	_less_than_16_left:
447	; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
448
449	pxor xmm1, xmm1
450	mov r11, rsp
451	movdqa [r11], xmm1
452
453	cmp arg3, 4
454	jl _only_less_than_4
455
456	; backup the counter value
457	mov r9, arg3
458	cmp arg3, 8
459	jl _less_than_8_left
460
461	; load 8 Bytes
462	mov rax, [arg2]
463	mov [r11], rax
464	add r11, 8
465	sub arg3, 8
466	add arg2, 8
467	_less_than_8_left:
468
469	cmp arg3, 4
470	jl _less_than_4_left
471
472	; load 4 Bytes
473	mov eax, [arg2]
474	mov [r11], eax
475	add r11, 4
476	sub arg3, 4
477	add arg2, 4
478	_less_than_4_left:
479
480	cmp arg3, 2
481	jl _less_than_2_left
482
483	; load 2 Bytes
484	mov ax, [arg2]
485	mov [r11], ax
486	add r11, 2
487	sub arg3, 2
488	add arg2, 2
489	_less_than_2_left:
490	cmp arg3, 1
491	jl _zero_left
492
493	; load 1 Byte
494	mov al, [arg2]
495	mov [r11], al
496	_zero_left:
497	movdqa xmm7, [rsp]
498	pshufb xmm7, xmm11
499	pxor xmm7, xmm0 ; xor the initial crc value
500
501	lea rax, [pshufb_shf_table + 16]
502	sub rax, r9
503	movdqu xmm0, [rax]
504	pxor xmm0, [mask1]
505
506	pshufb xmm7, xmm0
507	jmp _128_done
508
509	align 16
510	_exact_16_left:
511	movdqu xmm7, [arg2]
512	pshufb xmm7, xmm11
513	pxor xmm7, xmm0 ; xor the initial crc value
514
515	jmp _128_done
516
517	_only_less_than_4:
518	cmp arg3, 3
519	jl _only_less_than_3
520
521	; load 3 Bytes
522	mov al, [arg2]
523	mov [r11], al
524
525	mov al, [arg2+1]
526	mov [r11+1], al
527
528	mov al, [arg2+2]
529	mov [r11+2], al
530
531	movdqa xmm7, [rsp]
532	pshufb xmm7, xmm11
533	pxor xmm7, xmm0 ; xor the initial crc value
534
535	psrldq xmm7, 5
536
537	jmp _barrett
538	_only_less_than_3:
539	cmp arg3, 2
540	jl _only_less_than_2
541
542	; load 2 Bytes
543	mov al, [arg2]
544	mov [r11], al
545
546	mov al, [arg2+1]
547	mov [r11+1], al
548
549	movdqa xmm7, [rsp]
550	pshufb xmm7, xmm11
551	pxor xmm7, xmm0 ; xor the initial crc value
552
553	psrldq xmm7, 6
554
555	jmp _barrett
556	_only_less_than_2:
557
558	; load 1 Byte
559	mov al, [arg2]
560	mov [r11], al
561
562	movdqa xmm7, [rsp]
563	pshufb xmm7, xmm11
564	pxor xmm7, xmm0 ; xor the initial crc value
565
566	psrldq xmm7, 7
567
568	jmp _barrett
569
570	section .data
571
572	; precomputed constants
573	; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
574	align 16
575	; Q = 0x18BB70000
576	; rk1 = 2^(32*3) mod Q << 32
577	; rk2 = 2^(32*5) mod Q << 32
578	; rk3 = 2^(32*15) mod Q << 32
579	; rk4 = 2^(32*17) mod Q << 32
580	; rk5 = 2^(32*3) mod Q << 32
581	; rk6 = 2^(32*2) mod Q << 32
582	; rk7 = floor(2^64/Q)
583	; rk8 = Q
584	rk1:
585	DQ 0x2d56000000000000
586	rk2:
587	DQ 0x06df000000000000
588	rk3:
589	DQ 0x9d9d000000000000
590	rk4:
591	DQ 0x7cf5000000000000
592	rk5:
593	DQ 0x2d56000000000000
594	rk6:
595	DQ 0x1368000000000000
596	rk7:
597	DQ 0x00000001f65a57f8
598	rk8:
599	DQ 0x000000018bb70000
600
601	rk9:
602	DQ 0xceae000000000000
603	rk10:
604	DQ 0xbfd6000000000000
605	rk11:
606	DQ 0x1e16000000000000
607	rk12:
608	DQ 0x713c000000000000
609	rk13:
610	DQ 0xf7f9000000000000
611	rk14:
612	DQ 0x80a6000000000000
613	rk15:
614	DQ 0x044c000000000000
615	rk16:
616	DQ 0xe658000000000000
617	rk17:
618	DQ 0xad18000000000000
619	rk18:
620	DQ 0xa497000000000000
621	rk19:
622	DQ 0x6ee3000000000000
623	rk20:
624	DQ 0xe7b5000000000000
625
626
627
628
629
630
631
632
633
634	mask1:
635	dq 0x8080808080808080, 0x8080808080808080
636	mask2:
637	dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
638
639	SHUF_MASK:
640	dq 0x08090A0B0C0D0E0F, 0x0001020304050607
641
642	pshufb_shf_table:
643	; use these values for shift constants for the pshufb instruction
644	; different alignments result in values as shown:
645	; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
	646	; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
	647	; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
648	; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
649	; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
650	; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
651	; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
652	; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
653	; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
654	; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
655	; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
656	; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
657	; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
658	; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
659	; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
660	dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
661	dq 0x0706050403020100, 0x000e0d0c0b0a0908
662
663	;;; func core, ver, snum
664	slversion crc16_t10dif_01, 01, 06, 0010
665