;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2020 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;       Function API:
;       UINT16 crc16_t10dif_02(
;               UINT16 init_crc,          //initial CRC value, 16 bits
;               const unsigned char *buf, //buffer pointer to calculate CRC on
;               UINT64 len                //buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
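;
;       Example caller (illustrative sketch only; the C prototype below simply
;       mirrors the API comment above, and the data/data_len names and zero
;       seed are assumptions, not requirements):
;
;               #include <stdint.h>
;
;               uint16_t crc16_t10dif_02(uint16_t init_crc,
;                                        const unsigned char *buf,
;                                        uint64_t len);
;
;               uint16_t crc = crc16_t10dif_02(0, data, data_len);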

%include "reg_sizes.asm"

%define fetch_dist 1024

[bits 64]
default rel

section .text

%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8

        %xdefine        arg1_low32 ecx
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx

        %xdefine        arg1_low32 edi
%endif

%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_SAVE 16*2
        %define VARIABLE_OFFSET 16*10+8
%else
        %define VARIABLE_OFFSET 16*2+8
%endif

align 16
mk_global crc16_t10dif_02, function
crc16_t10dif_02:
        endbranch

        ; adjust the 16-bit initial_crc value, scale it to 32 bits
        shl     arg1_low32, 16

        ; After this point, the code flow is exactly the same as for a 32-bit CRC.
        ; The only difference is that before returning eax, we shift it right 16 bits, to scale back to 16 bits.
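        ; For example (illustrative value): init_crc = 0x1234 is scaled to
        ; 0x12340000 here, and _cleanup later performs "shr eax, 16" to scale
        ; the 32-bit result back down to the 16-bit CRC.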

        sub     rsp, VARIABLE_OFFSET
%ifidn __OUTPUT_FORMAT__, win64
        ; save the xmm registers on the stack so they can be restored on return
        ; (xmm6-xmm15 are callee-saved in the Windows x64 calling convention)
        vmovdqa [rsp+16*2], xmm6
        vmovdqa [rsp+16*3], xmm7
        vmovdqa [rsp+16*4], xmm8
        vmovdqa [rsp+16*5], xmm9
        vmovdqa [rsp+16*6], xmm10
        vmovdqa [rsp+16*7], xmm11
        vmovdqa [rsp+16*8], xmm12
        vmovdqa [rsp+16*9], xmm13
%endif

        ; check if smaller than 256
        cmp     arg3, 256

        ; for sizes less than 256, we can't fold 128B at a time...
        jl      _less_than_256


        ; load the initial crc value
        vmovd   xmm10, arg1_low32       ; initial crc

        ; the crc value does not need to be byte-reflected, but it does need to be moved to the high part of the register,
        ; because the data will be byte-reflected and will then align with the initial crc in the correct place.
        vpslldq xmm10, 12

        vmovdqa xmm11, [SHUF_MASK]
        ; receive the initial 128B data, xor the initial crc value
        vmovdqu xmm0, [arg2+16*0]
        vmovdqu xmm1, [arg2+16*1]
        vmovdqu xmm2, [arg2+16*2]
        vmovdqu xmm3, [arg2+16*3]
        vmovdqu xmm4, [arg2+16*4]
        vmovdqu xmm5, [arg2+16*5]
        vmovdqu xmm6, [arg2+16*6]
        vmovdqu xmm7, [arg2+16*7]

        vpshufb xmm0, xmm11
        ; XOR the initial_crc value
        vpxor   xmm0, xmm10
        vpshufb xmm1, xmm11
        vpshufb xmm2, xmm11
        vpshufb xmm3, xmm11
        vpshufb xmm4, xmm11
        vpshufb xmm5, xmm11
        vpshufb xmm6, xmm11
        vpshufb xmm7, xmm11

        vmovdqa xmm10, [rk3]    ; xmm10 has rk3 and rk4
                                ; the imm value of the pclmulqdq instruction
                                ; determines which constant is used
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; we subtract 256 instead of 128 to save one instruction from the loop
        sub     arg3, 256

        ; at this point in the code there are 128*x+y (0 <= y < 128) bytes of buffer left.
        ; The _fold_128_B_loop will fold 128B at a time until only 128+y bytes of buffer remain


        ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
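        ; Each xmm register holds 128 bits of partial remainder. Folding it over
        ; the next 128 bytes of input uses the standard pclmulqdq folding identity
        ; (see the referenced Intel paper); in the loop below, for each register:
        ;       acc' = (lo64(acc) clmul rk3) xor (hi64(acc) clmul rk4) xor new_data
        ; where rk3/rk4 sit in the low/high qwords of xmm10 and are selected by the
        ; pclmulqdq immediate (0x00 = low qwords, 0x11 = high qwords).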
_fold_128_B_loop:

        ; update the buffer pointer
        add     arg2, 128               ; buf += 128;

        prefetchnta [arg2+fetch_dist+0]
        vmovdqu xmm9, [arg2+16*0]
        vmovdqu xmm12, [arg2+16*1]
        vpshufb xmm9, xmm11
        vpshufb xmm12, xmm11
        vmovdqa xmm8, xmm0
        vmovdqa xmm13, xmm1
        vpclmulqdq      xmm0, xmm10, 0x0
        vpclmulqdq      xmm8, xmm10, 0x11
        vpclmulqdq      xmm1, xmm10, 0x0
        vpclmulqdq      xmm13, xmm10, 0x11
        vpxor   xmm0, xmm9
        vxorps  xmm0, xmm8
        vpxor   xmm1, xmm12
        vxorps  xmm1, xmm13

        prefetchnta [arg2+fetch_dist+32]
        vmovdqu xmm9, [arg2+16*2]
        vmovdqu xmm12, [arg2+16*3]
        vpshufb xmm9, xmm11
        vpshufb xmm12, xmm11
        vmovdqa xmm8, xmm2
        vmovdqa xmm13, xmm3
        vpclmulqdq      xmm2, xmm10, 0x0
        vpclmulqdq      xmm8, xmm10, 0x11
        vpclmulqdq      xmm3, xmm10, 0x0
        vpclmulqdq      xmm13, xmm10, 0x11
        vpxor   xmm2, xmm9
        vxorps  xmm2, xmm8
        vpxor   xmm3, xmm12
        vxorps  xmm3, xmm13

        prefetchnta [arg2+fetch_dist+64]
        vmovdqu xmm9, [arg2+16*4]
        vmovdqu xmm12, [arg2+16*5]
        vpshufb xmm9, xmm11
        vpshufb xmm12, xmm11
        vmovdqa xmm8, xmm4
        vmovdqa xmm13, xmm5
        vpclmulqdq      xmm4, xmm10, 0x0
        vpclmulqdq      xmm8, xmm10, 0x11
        vpclmulqdq      xmm5, xmm10, 0x0
        vpclmulqdq      xmm13, xmm10, 0x11
        vpxor   xmm4, xmm9
        vxorps  xmm4, xmm8
        vpxor   xmm5, xmm12
        vxorps  xmm5, xmm13

        prefetchnta [arg2+fetch_dist+96]
        vmovdqu xmm9, [arg2+16*6]
        vmovdqu xmm12, [arg2+16*7]
        vpshufb xmm9, xmm11
        vpshufb xmm12, xmm11
        vmovdqa xmm8, xmm6
        vmovdqa xmm13, xmm7
        vpclmulqdq      xmm6, xmm10, 0x0
        vpclmulqdq      xmm8, xmm10, 0x11
        vpclmulqdq      xmm7, xmm10, 0x0
        vpclmulqdq      xmm13, xmm10, 0x11
        vpxor   xmm6, xmm9
        vxorps  xmm6, xmm8
        vpxor   xmm7, xmm12
        vxorps  xmm7, xmm13

        sub     arg3, 128

        ; check if there is another 128B in the buffer to be able to fold
        jge     _fold_128_B_loop
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


        add     arg2, 128
        ; at this point, the buffer pointer is pointing at the last y bytes of the buffer
        ; fold the 8 xmm registers to 1 xmm register with different constants
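        ; xmm0..xmm6 each lie a different multiple of 16 bytes ahead of xmm7,
        ; so each one is folded into xmm7 with its own pair of constants:
        ; rk9/rk10 for xmm0, rk11/rk12 for xmm1, ..., rk19/rk20 for xmm5, and
        ; finally rk1/rk2 (the 16-byte fold constants) for xmm6.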

        vmovdqa xmm10, [rk9]
        vmovdqa xmm8, xmm0
        vpclmulqdq      xmm0, xmm10, 0x11
        vpclmulqdq      xmm8, xmm10, 0x0
        vpxor   xmm7, xmm8
        vxorps  xmm7, xmm0

        vmovdqa xmm10, [rk11]
        vmovdqa xmm8, xmm1
        vpclmulqdq      xmm1, xmm10, 0x11
        vpclmulqdq      xmm8, xmm10, 0x0
        vpxor   xmm7, xmm8
        vxorps  xmm7, xmm1

        vmovdqa xmm10, [rk13]
        vmovdqa xmm8, xmm2
        vpclmulqdq      xmm2, xmm10, 0x11
        vpclmulqdq      xmm8, xmm10, 0x0
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm2

        vmovdqa xmm10, [rk15]
        vmovdqa xmm8, xmm3
        vpclmulqdq      xmm3, xmm10, 0x11
        vpclmulqdq      xmm8, xmm10, 0x0
        vpxor   xmm7, xmm8
        vxorps  xmm7, xmm3

        vmovdqa xmm10, [rk17]
        vmovdqa xmm8, xmm4
        vpclmulqdq      xmm4, xmm10, 0x11
        vpclmulqdq      xmm8, xmm10, 0x0
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm4

        vmovdqa xmm10, [rk19]
        vmovdqa xmm8, xmm5
        vpclmulqdq      xmm5, xmm10, 0x11
        vpclmulqdq      xmm8, xmm10, 0x0
        vpxor   xmm7, xmm8
        vxorps  xmm7, xmm5

        vmovdqa xmm10, [rk1]    ; xmm10 has rk1 and rk2
                                ; the imm value of the pclmulqdq instruction
                                ; determines which constant is used
        vmovdqa xmm8, xmm6
        vpclmulqdq      xmm6, xmm10, 0x11
        vpclmulqdq      xmm8, xmm10, 0x0
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm6


        ; instead of 128, we add 112 (128-16) to the loop counter to save 1 instruction from the loop
        ; instead of a cmp instruction, we use the sign flag with the jl instruction
        add     arg3, 128-16
        jl      _final_reduction_for_128

        ; now we have 16+y bytes left to reduce. 16 bytes are in register xmm7 and the rest is in memory
        ; we can fold 16 bytes at a time if y >= 16
        ; continue folding 16B at a time
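        ; (same fold identity as the 128B loop, but over a 16-byte distance:
        ;  xmm10 still holds rk1/rk2, so each iteration computes
        ;  xmm7 = (lo64(xmm7) clmul rk1) xor (hi64(xmm7) clmul rk2) xor next_16_bytes)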

_16B_reduction_loop:
        vmovdqa xmm8, xmm7
        vpclmulqdq      xmm7, xmm10, 0x11
        vpclmulqdq      xmm8, xmm10, 0x0
        vpxor   xmm7, xmm8
        vmovdqu xmm0, [arg2]
        vpshufb xmm0, xmm11
        vpxor   xmm7, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the jge instruction
        ; equivalent of: cmp arg3, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge     _16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm7 register


_final_reduction_for_128:
        ; check if there is any more data to fold. If not, compute the CRC of the final 128 bits
        add     arg3, 16
        je      _128_done

        ; here we are handling data that is less than 16 bytes.
        ; since we know that there was data before the pointer, we can move the input pointer back so that exactly 16 bytes are loaded.
        ; after that, the registers need to be adjusted.
_get_last_two_xmms:
        vmovdqa xmm2, xmm7

        vmovdqu xmm1, [arg2 - 16 + arg3]
        vpshufb xmm1, xmm11

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, arg3
        vmovdqu xmm0, [rax]

        ; shift xmm2 to the left by arg3 bytes
        vpshufb xmm2, xmm0

        ; shift xmm7 to the right by 16-arg3 bytes
        vpxor   xmm0, [mask1]
        vpshufb xmm7, xmm0
        vpblendvb       xmm1, xmm1, xmm2, xmm0

        ; fold 16 bytes
        vmovdqa xmm2, xmm1
        vmovdqa xmm8, xmm7
        vpclmulqdq      xmm7, xmm10, 0x11
        vpclmulqdq      xmm8, xmm10, 0x0
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm2

_128_done:
        ; compute crc of a 128-bit value
        vmovdqa xmm10, [rk5]    ; rk5 and rk6 in xmm10
        vmovdqa xmm0, xmm7

        ; 64b fold
        vpclmulqdq      xmm7, xmm10, 0x1
        vpslldq xmm0, 8
        vpxor   xmm7, xmm0

        ; 32b fold
        vmovdqa xmm0, xmm7

        vpand   xmm0, [mask2]

        vpsrldq xmm7, 12
        vpclmulqdq      xmm7, xmm10, 0x10
        vpxor   xmm7, xmm0

        ; barrett reduction
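        ; the quotient of the remaining value by the polynomial is estimated
        ; with a carry-less multiply by rk7 = floor(2^64/Q); that quotient is
        ; then multiplied by rk8 = Q and xored back in, leaving the 32-bit
        ; (scaled) CRC, which is extracted from xmm7 below
        ; (see the referenced Intel paper for the derivation)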
_barrett:
        vmovdqa xmm10, [rk7]    ; rk7 and rk8 in xmm10
        vmovdqa xmm0, xmm7
        vpclmulqdq      xmm7, xmm10, 0x01
        vpslldq xmm7, 4
        vpclmulqdq      xmm7, xmm10, 0x11

        vpslldq xmm7, 4
        vpxor   xmm7, xmm0
        vpextrd eax, xmm7, 1

_cleanup:
        ; scale the result back to 16 bits
        shr     eax, 16
%ifidn __OUTPUT_FORMAT__, win64
        vmovdqa xmm6, [rsp+16*2]
        vmovdqa xmm7, [rsp+16*3]
        vmovdqa xmm8, [rsp+16*4]
        vmovdqa xmm9, [rsp+16*5]
        vmovdqa xmm10, [rsp+16*6]
        vmovdqa xmm11, [rsp+16*7]
        vmovdqa xmm12, [rsp+16*8]
        vmovdqa xmm13, [rsp+16*9]
%endif
        add     rsp, VARIABLE_OFFSET
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_256:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      _less_than_32
        vmovdqa xmm11, [SHUF_MASK]

        ; if there is, load the constants
        vmovdqa xmm10, [rk1]    ; rk1 and rk2 in xmm10

        vmovd   xmm0, arg1_low32        ; get the initial crc value
        vpslldq xmm0, 12                ; align it to its correct place
        vmovdqu xmm7, [arg2]            ; load the plaintext
        vpshufb xmm7, xmm11             ; byte-reflect the plaintext
        vpxor   xmm7, xmm0


        ; update the buffer pointer
        add     arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        sub     arg3, 32

        jmp     _16B_reduction_loop


align 16
_less_than_32:
        ; move the initial crc to the return value. this is necessary for zero-length buffers.
        mov     eax, arg1_low32
        test    arg3, arg3
        je      _cleanup

        vmovdqa xmm11, [SHUF_MASK]

        vmovd   xmm0, arg1_low32        ; get the initial crc value
        vpslldq xmm0, 12                ; align it to its correct place

        cmp     arg3, 16
        je      _exact_16_left
        jl      _less_than_16_left

        vmovdqu xmm7, [arg2]            ; load the plaintext
        vpshufb xmm7, xmm11             ; byte-reflect the plaintext
        vpxor   xmm7, xmm0              ; xor the initial crc value
        add     arg2, 16
        sub     arg3, 16
        vmovdqa xmm10, [rk1]            ; rk1 and rk2 in xmm10
        jmp     _get_last_two_xmms


align 16
_less_than_16_left:
        ; use stack space to load data shorter than 16 bytes; zero out the 16B of memory first.

        vpxor   xmm1, xmm1
        mov     r11, rsp
        vmovdqa [r11], xmm1

        cmp     arg3, 4
        jl      _only_less_than_4

        ; back up the counter value
        mov     r9, arg3
        cmp     arg3, 8
        jl      _less_than_8_left

        ; load 8 bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
_less_than_8_left:

        cmp     arg3, 4
        jl      _less_than_4_left

        ; load 4 bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
_less_than_4_left:

        cmp     arg3, 2
        jl      _less_than_2_left

        ; load 2 bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
_less_than_2_left:
        cmp     arg3, 1
        jl      _zero_left

        ; load 1 byte
        mov     al, [arg2]
        mov     [r11], al
_zero_left:
        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm11
        vpxor   xmm7, xmm0      ; xor the initial crc value

        lea     rax, [pshufb_shf_table + 16]
        sub     rax, r9
        vmovdqu xmm0, [rax]
        vpxor   xmm0, [mask1]

        vpshufb xmm7, xmm0
        jmp     _128_done

align 16
_exact_16_left:
        vmovdqu xmm7, [arg2]
        vpshufb xmm7, xmm11
        vpxor   xmm7, xmm0      ; xor the initial crc value

        jmp     _128_done

_only_less_than_4:
        cmp     arg3, 3
        jl      _only_less_than_3

        ; load 3 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        mov     al, [arg2+2]
        mov     [r11+2], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm11
        vpxor   xmm7, xmm0      ; xor the initial crc value

        vpsrldq xmm7, 5

        jmp     _barrett
_only_less_than_3:
        cmp     arg3, 2
        jl      _only_less_than_2

        ; load 2 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm11
        vpxor   xmm7, xmm0      ; xor the initial crc value

        vpsrldq xmm7, 6

        jmp     _barrett
_only_less_than_2:

        ; load 1 byte
        mov     al, [arg2]
        mov     [r11], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm11
        vpxor   xmm7, xmm0      ; xor the initial crc value

        vpsrldq xmm7, 7

        jmp     _barrett

section .data

; precomputed constants
; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
align 16
; Q = 0x18BB70000
; rk1 = 2^(32*3) mod Q << 32
; rk2 = 2^(32*5) mod Q << 32
; rk3 = 2^(32*15) mod Q << 32
; rk4 = 2^(32*17) mod Q << 32
; rk5 = 2^(32*3) mod Q << 32
; rk6 = 2^(32*2) mod Q << 32
; rk7 = floor(2^64/Q)
; rk8 = Q
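; (all of the mod, floor-division and multiplication above are carry-less,
;  i.e. polynomial arithmetic over GF(2), matching what pclmulqdq computes,
;  not ordinary integer arithmetic)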
rk1:
DQ 0x2d56000000000000
rk2:
DQ 0x06df000000000000
rk3:
DQ 0x9d9d000000000000
rk4:
DQ 0x7cf5000000000000
rk5:
DQ 0x2d56000000000000
rk6:
DQ 0x1368000000000000
rk7:
DQ 0x00000001f65a57f8
rk8:
DQ 0x000000018bb70000

rk9:
DQ 0xceae000000000000
rk10:
DQ 0xbfd6000000000000
rk11:
DQ 0x1e16000000000000
rk12:
DQ 0x713c000000000000
rk13:
DQ 0xf7f9000000000000
rk14:
DQ 0x80a6000000000000
rk15:
DQ 0x044c000000000000
rk16:
DQ 0xe658000000000000
rk17:
DQ 0xad18000000000000
rk18:
DQ 0xa497000000000000
rk19:
DQ 0x6ee3000000000000
rk20:
DQ 0xe7b5000000000000

mask1:
dq 0x8080808080808080, 0x8080808080808080
mask2:
dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
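; mask1: every byte 0x80; xoring a pshufb_shf_table entry with it flips between
;        the complementary left-shift and right-shift pshufb control masks
; mask2: keeps the low 96 bits of a register and clears the top 32 bits
;        (used in the 32-bit fold in _128_done)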

SHUF_MASK:
dq 0x08090A0B0C0D0E0F, 0x0001020304050607
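; SHUF_MASK reverses the byte order of a 16-byte register; the T10-DIF CRC is
; not bit-reflected, so input bytes are processed most-significant-byte first.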

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1)  / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2)  / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3)  / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4)  / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5)  / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6)  / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7)  / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8)  / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9)  / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
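; (a control byte with its top bit set, 0x80-0x8f above, makes pshufb write a
;  zero into that lane, which is how these masks implement the byte shifts)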
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908