; [ceph.git] / ceph / src / isa-l / crc / crc16_t10dif_by16_10.asm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2020 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;       Function API:
;       UINT32 crc16_t10dif_by16_10(
;               UINT16 init_crc, //initial CRC value, 16 bits
;               const unsigned char *buf, //buffer pointer to calculate CRC on
;               UINT64 len //buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
;
;

%include "reg_sizes.asm"

; Default the exported symbol name unless FUNCTION_NAME was already defined
; before this point.
%ifndef FUNCTION_NAME
%define FUNCTION_NAME crc16_t10dif_by16_10
%endif

; The implementation below is assembled only when AS_FEATURE_LEVEL >= 10; the
; matching %else branch near the end of the file handles assemblers that do
; not understand these opcodes.
; (AS_FEATURE_LEVEL is not defined in this file -- presumably it comes from
; reg_sizes.asm or the build; confirm.)
%if (AS_FEATURE_LEVEL) >= 10

[bits 64]
default rel	; bare [symbol] operands below are RIP-relative

section .text


; Argument register aliases, selected by output format.
;   arg1 / arg1_low32 : init_crc
;   arg2              : buf
;   arg3              : len (bytes)
%ifidn __OUTPUT_FORMAT__, win64
	; win64 output: first three integer arguments are in rcx, rdx, r8
	%xdefine	arg1 rcx
	%xdefine	arg2 rdx
	%xdefine	arg3 r8

	%xdefine	arg1_low32 ecx
%else
	; every other output format: first three integer arguments are in rdi, rsi, rdx
	%xdefine	arg1 rdi
	%xdefine	arg2 rsi
	%xdefine	arg3 rdx

	%xdefine	arg1_low32 edi
%endif

; Stack frame layout, as offsets from rsp after "sub rsp, VARIABLE_OFFSET":
;   TMP      (offset 0)    : scratch area; the short-input paths zero and fill the
;                            16 bytes at [rsp] directly (TMP itself is not referenced
;                            by name in the code below)
;   XMM_SAVE (win64 only)  : ten 16-byte slots for xmm6-xmm15
; VARIABLE_OFFSET is a multiple of 16 plus 8. Assuming the usual rsp % 16 == 8 at
; function entry, that makes rsp 16-byte aligned after the sub, which the
; vmovdqa accesses to [rsp + ...] depend on.
%define TMP 16*0
%ifidn __OUTPUT_FORMAT__, win64
	%define XMM_SAVE 16*2
	%define VARIABLE_OFFSET 16*12+8
%else
	%define VARIABLE_OFFSET 16*2+8
%endif

;-----------------------------------------------------------------------
; FUNCTION_NAME (default: crc16_t10dif_by16_10)
;   In:    arg1_low32 = init_crc (16-bit value), arg2 = buf, arg3 = len in bytes
;   Out:   eax = CRC, shifted back down to 16 bits at .cleanup
;   Stack: reserves VARIABLE_OFFSET bytes; [rsp] is used as a 16-byte scratch
;          buffer by the short-input paths
;   Regs:  writes rax, r9, r11, the three argument registers, flags and
;          zmm0-zmm18; on win64 xmm6-xmm15 are saved here and restored at
;          .cleanup
;   mk_global and endbranch are macros that are not defined in this file
;   (presumably provided by reg_sizes.asm -- confirm).
;-----------------------------------------------------------------------
align 16
mk_global FUNCTION_NAME, function
FUNCTION_NAME:
	endbranch

	; adjust the 16-bit initial_crc value, scale it to 32 bits
	; (the shift also discards whatever was in bits 16-31 of the argument register)
	shl		arg1_low32, 16

	; After this point, code flow is exactly same as a 32-bit CRC.
	; The only difference is before returning eax, we will shift it right 16 bits, to scale back to 16 bits.

	sub		rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
	; save xmm6-xmm15 on the stack; they are reloaded at .cleanup before returning
	vmovdqa		[rsp + XMM_SAVE + 16*0], xmm6
	vmovdqa		[rsp + XMM_SAVE + 16*1], xmm7
	vmovdqa		[rsp + XMM_SAVE + 16*2], xmm8
	vmovdqa		[rsp + XMM_SAVE + 16*3], xmm9
	vmovdqa		[rsp + XMM_SAVE + 16*4], xmm10
	vmovdqa		[rsp + XMM_SAVE + 16*5], xmm11
	vmovdqa		[rsp + XMM_SAVE + 16*6], xmm12
	vmovdqa		[rsp + XMM_SAVE + 16*7], xmm13
	vmovdqa		[rsp + XMM_SAVE + 16*8], xmm14
	vmovdqa		[rsp + XMM_SAVE + 16*9], xmm15
%endif

	; zmm18 = SHUF_MASK replicated into all four 128-bit lanes; used with vpshufb
	; to reverse the byte order inside every 16-byte block of input
	vbroadcasti32x4 zmm18, [SHUF_MASK]

	; buffers shorter than 256 bytes are handled by the narrower paths at .less_than_256
	; NOTE(review): jl is a signed compare, so a len with bit 63 set would also take
	; this branch -- confirm callers never pass such lengths.
	cmp		arg3, 256
	jl		.less_than_256

	; load the initial crc value
	vmovd		xmm10, arg1_low32      ; initial crc

	; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
	; because data will be byte-reflected and will align with initial crc at correct place.
	vpslldq		xmm10, 12

	; receive the initial 128B of data (zmm0 = bytes 0-63, zmm4 = bytes 64-127),
	; byte-reflect it, and xor the initial crc value into zmm0.
	; Only the low 128 bits of zmm10 are non-zero here (xmm10 was written by
	; vmovd/vpslldq), so the xor affects just the first 16-byte block.
	vmovdqu8	zmm0, [arg2+16*0]
	vmovdqu8	zmm4, [arg2+16*4]
	vpshufb		zmm0, zmm0, zmm18
	vpshufb		zmm4, zmm4, zmm18
	vpxorq		zmm0, zmm10
	vbroadcasti32x4	zmm10, [rk3]	;every 128-bit lane of zmm10 has rk3 and rk4
					;imm value of pclmulqdq instruction will determine which constant to use

	; arg3 = len - 256: covers the 128 bytes loaded above plus the 128 bytes that
	; the first .fold_128_B_loop iteration will load
	sub		arg3, 256
	; fewer than 256 bytes beyond that: use the 128-byte loop only
	cmp		arg3, 256
	jl		.fold_128_B_loop

	; otherwise widen to four accumulators: zmm7 = bytes 128-191, zmm8 = bytes 192-255
	vmovdqu8	zmm7, [arg2+16*8]
	vmovdqu8	zmm8, [arg2+16*12]
	vpshufb		zmm7, zmm7, zmm18
	vpshufb		zmm8, zmm8, zmm18
	vbroadcasti32x4 zmm16, [rk_1]	;zmm16 has rk-1 and rk-2
	; arg3 = len - 512: 256 bytes are now held in registers and the first
	; .fold_256_B_loop iteration will load another 256
	sub		arg3, 256

.fold_256_B_loop:
	; Each iteration advances arg2 by 256 and folds the four 64-byte accumulators
	; (zmm0, zmm4, zmm7, zmm8) onto the next 256 bytes of input:
	;   acc = clmul(acc, zmm16, 0x00) ^ clmul(acc, zmm16, 0x11) ^ byte_reflect(next 64 bytes)
	; zmm16 holds the rk_1/rk_2 pair in every 128-bit lane.
	add		arg2, 256
	vmovdqu8	zmm3, [arg2+16*0]
	vpshufb		zmm3, zmm3, zmm18
	vpclmulqdq	zmm1, zmm0, zmm16, 0x00
	vpclmulqdq	zmm2, zmm0, zmm16, 0x11
	vpxorq		zmm0, zmm1, zmm2
	vpxorq		zmm0, zmm0, zmm3

	vmovdqu8	zmm9, [arg2+16*4]
	vpshufb		zmm9, zmm9, zmm18
	vpclmulqdq	zmm5, zmm4, zmm16, 0x00
	vpclmulqdq	zmm6, zmm4, zmm16, 0x11
	vpxorq		zmm4, zmm5, zmm6
	vpxorq		zmm4, zmm4, zmm9

	vmovdqu8	zmm11, [arg2+16*8]
	vpshufb		zmm11, zmm11, zmm18
	vpclmulqdq	zmm12, zmm7, zmm16, 0x00
	vpclmulqdq	zmm13, zmm7, zmm16, 0x11
	vpxorq		zmm7, zmm12, zmm13
	vpxorq		zmm7, zmm7, zmm11

	vmovdqu8	zmm17, [arg2+16*12]
	vpshufb		zmm17, zmm17, zmm18
	vpclmulqdq	zmm14, zmm8, zmm16, 0x00
	vpclmulqdq	zmm15, zmm8, zmm16, 0x11
	vpxorq		zmm8, zmm14, zmm15
	vpxorq		zmm8, zmm8, zmm17

	; keep looping while at least another 256 bytes remain (flags come from the sub)
	sub		arg3, 256
	jge     	.fold_256_B_loop

	;; Fold 256 into 128
	; step past the last 256 bytes consumed, then fold zmm0 into zmm7 and zmm4 into
	; zmm8 using the rk3/rk4 pair in zmm10 (the same pair .fold_128_B_loop uses)
	add		arg2, 256
	vpclmulqdq	zmm1, zmm0, zmm10, 0x00
	vpclmulqdq	zmm2, zmm0, zmm10, 0x11
	vpternlogq	zmm7, zmm1, zmm2, 0x96	; xor ABC

	vpclmulqdq	zmm5, zmm4, zmm10, 0x00
	vpclmulqdq	zmm6, zmm4, zmm10, 0x11
	vpternlogq	zmm8, zmm5, zmm6, 0x96	; xor ABC

	; .fold_128_B_register expects the 128 bytes of state in zmm0 and zmm4
	vmovdqa32	zmm0, zmm7
	vmovdqa32	zmm4, zmm8

	; re-bias the counter so it matches what .fold_128_B_loop leaves behind
	; (.fold_128_B_register goes on to add 128-16)
	add		arg3, 128
	jmp		.fold_128_B_register


	; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The fold_128_B_loop
	; loop will fold 128B at a time until we have 128+y Bytes of buffer

	; fold 128B at a time. This section of the code folds two zmm registers
	; (zmm0 and zmm4, i.e. eight 128-bit lanes) in parallel, using the rk3/rk4 pair in zmm10
.fold_128_B_loop:
	add		arg2, 128
	vmovdqu8	zmm8, [arg2+16*0]
	vpshufb		zmm8, zmm8, zmm18
	vpclmulqdq	zmm2, zmm0, zmm10, 0x00
	vpclmulqdq	zmm1, zmm0, zmm10, 0x11
	vpxorq		zmm0, zmm2, zmm1
	vpxorq		zmm0, zmm0, zmm8

	vmovdqu8	zmm9, [arg2+16*4]
	vpshufb		zmm9, zmm9, zmm18
	vpclmulqdq	zmm5, zmm4, zmm10, 0x00
	vpclmulqdq	zmm6, zmm4, zmm10, 0x11
	vpxorq		zmm4, zmm5, zmm6
	vpxorq		zmm4, zmm4, zmm9

	; keep looping while at least another 128 bytes remain (flags come from the sub)
	sub		arg3, 128
	jge		.fold_128_B_loop
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	add		arg2, 128
	; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
	; the 128B of folded data is in zmm0 and zmm4 (eight 128-bit lanes)

.fold_128_B_register:
	; fold the 8 128b parts (the lanes of zmm0 and zmm4) into 1 xmm register with different constants
	; Each 64-byte load picks up four consecutive 16-byte constant pairs, one per lane.
	vmovdqu8	zmm16, [rk9]		; multiply by rk9-rk16
	vmovdqu8	zmm11, [rk17]		; multiply by rk17-rk20, rk1,rk2, 0,0
	vpclmulqdq	zmm1, zmm0, zmm16, 0x00
	vpclmulqdq	zmm2, zmm0, zmm16, 0x11
	; the top lane of zmm4 is paired with the 0,0 constants above, so its products
	; are zero; keep a copy in xmm7 and xor it in unmultiplied below
	vextracti64x2	xmm7, zmm4, 3		; save last that has no multiplicand

	vpclmulqdq	zmm5, zmm4, zmm11, 0x00
	vpclmulqdq	zmm6, zmm4, zmm11, 0x11
	vmovdqa		xmm10, [rk1]		; Needed later in reduction loop
	vpternlogq	zmm1, zmm2, zmm5, 0x96	; xor ABC
	vpternlogq	zmm1, zmm6, zmm7, 0x96	; xor ABC

	; horizontal xor of the four 128-bit lanes of zmm1 into xmm7:
	; swap the 256-bit halves, xor the halves together, then xor the two remaining lanes
	vshufi64x2      zmm8, zmm1, zmm1, 0x4e ; Swap 1,0,3,2 - 01 00 11 10
	vpxorq          ymm8, ymm8, ymm1
	vextracti64x2   xmm5, ymm8, 1
	vpxorq          xmm7, xmm5, xmm8

	; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
	; instead of a cmp instruction, we use the negative flag with the jl instruction
	; (after the add, arg3 = bytes still unread in memory minus 16)
	add		arg3, 128-16
	jl		.final_reduction_for_128

	; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
	; we can fold 16 bytes at a time if y>=16
	; continue folding 16B at a time
	; This loop is also entered by a jump from .less_than_256. On entry xmm10 holds
	; rk1/rk2 and arg3 holds the number of unread bytes minus 16.

.16B_reduction_loop:
	vpclmulqdq	xmm8, xmm7, xmm10, 0x11
	vpclmulqdq	xmm7, xmm7, xmm10, 0x00
	vpxor		xmm7, xmm8
	vmovdqu		xmm0, [arg2]		; next 16 bytes of input
	vpshufb		xmm0, xmm0, xmm18	; byte-reflect them
	vpxor		xmm7, xmm0
	add		arg2, 16
	sub		arg3, 16
	; instead of a cmp instruction, we utilize the flags with the jge instruction
	; equivalent of: cmp arg3, 16-16
	; check if there is any more 16B in the buffer to be able to fold
	jge		.16B_reduction_loop

	;now we have 16+z bytes left to reduce, where 0<= z < 16.
	;first, we reduce the data in the xmm7 register


.final_reduction_for_128:
	; undo the -16 bias: arg3 = number of trailing bytes still in memory (0..15).
	; If there are none, xmm7 already holds the full 128-bit remainder.
	add		arg3, 16
	je		.128_done

	; here we are getting data that is less than 16 bytes.
	; since we know that there was data before the pointer, we can offset
	; the input pointer before the actual point, to receive exactly 16 bytes.
	; after that the registers need to be adjusted.
	; (.less_than_32 also jumps here, with arg3 = len - 16 and xmm10 = rk1/rk2.)
.get_last_two_xmms:

	vmovdqa		xmm2, xmm7
	vmovdqu		xmm1, [arg2 - 16 + arg3]	; the last 16 bytes of the buffer
	vpshufb		xmm1, xmm18

	; get rid of the extra data that was loaded before
	; load the shift constant
	; (16 control bytes starting at pshufb_shf_table + 16 - arg3)
	lea		rax, [pshufb_shf_table + 16]
	sub		rax, arg3
	vmovdqu		xmm0, [rax]

	vpshufb		xmm2, xmm0		; xmm2 = old xmm7 shifted left by arg3 bytes
	vpxor		xmm0, [mask1]		; flip bit 7 of every control byte
	vpshufb		xmm7, xmm0		; xmm7 = old xmm7 shifted right by 16-arg3 bytes
	; per byte: take xmm2 where bit 7 of xmm0 is set, otherwise keep the freshly
	; loaded xmm1 byte (the arg3 new input bytes)
	vpblendvb	xmm1, xmm1, xmm2, xmm0

	; one more 16-byte fold of xmm7 with rk1/rk2, then xor in the blended block
	vpclmulqdq	xmm8, xmm7, xmm10, 0x11
	vpclmulqdq	xmm7, xmm7, xmm10, 0x00
	vpxor		xmm7, xmm8
	vpxor		xmm7, xmm1

.128_done:
	; compute crc of a 128-bit value
	; In: xmm7 = 128-bit value to reduce. Out: eax = 32-bit result (before the
	; final shift in .cleanup).
	vmovdqa		xmm10, [rk5]		; rk5 in the low qword, rk6 in the high qword
	vmovdqa		xmm0, xmm7

	;64b fold
	vpclmulqdq	xmm7, xmm10, 0x01	; H*L
	vpslldq		xmm0, 8
	vpxor		xmm7, xmm0

	;32b fold
	vmovdqa		xmm0, xmm7
	vpand		xmm0, [mask2]		; clear the top dword in the copy
	vpsrldq		xmm7, 12		; move the top dword down to the bottom
	vpclmulqdq	xmm7, xmm10, 0x10	; low qword of xmm7 * high qword of xmm10 (rk6)
	vpxor		xmm7, xmm0

	;barrett reduction
	; The 1-, 2- and 3-byte paths jump straight here, skipping the two folds above.
.barrett:
	vmovdqa		xmm10, [rk7]	; rk7 and rk8 in xmm10
	vmovdqa		xmm0, xmm7
	vpclmulqdq	xmm7, xmm10, 0x01	; high qword of xmm7 * rk7
	vpslldq		xmm7, 4
	vpclmulqdq	xmm7, xmm10, 0x11	; high qword of xmm7 * rk8

	vpslldq		xmm7, 4
	vpxor		xmm7, xmm0
	vpextrd		eax, xmm7, 1	; result is dword 1 of xmm7

.cleanup:
	; Common exit. In: eax = 32-bit scaled CRC (or the scaled init_crc when the
	; zero-length path jumps here directly).
	; scale the result back to 16 bits
	shr		eax, 16

%ifidn __OUTPUT_FORMAT__, win64
	; reload the xmm6-xmm15 values saved in the prologue
	vmovdqa		xmm6, [rsp + XMM_SAVE + 16*0]
	vmovdqa		xmm7, [rsp + XMM_SAVE + 16*1]
	vmovdqa		xmm8, [rsp + XMM_SAVE + 16*2]
	vmovdqa		xmm9, [rsp + XMM_SAVE + 16*3]
	vmovdqa		xmm10, [rsp + XMM_SAVE + 16*4]
	vmovdqa		xmm11, [rsp + XMM_SAVE + 16*5]
	vmovdqa		xmm12, [rsp + XMM_SAVE + 16*6]
	vmovdqa		xmm13, [rsp + XMM_SAVE + 16*7]
	vmovdqa		xmm14, [rsp + XMM_SAVE + 16*8]
	vmovdqa		xmm15, [rsp + XMM_SAVE + 16*9]
%endif
	; The folding paths write 256/512-bit values into zmm0-zmm15. Clear those
	; upper halves before returning so that a caller running legacy-SSE code does
	; not pay the AVX/SSE state-transition penalty. vzeroupper leaves the low
	; 128 bits of every register (including the xmm6-xmm15 values restored above)
	; and the flags untouched.
	vzeroupper
	add		rsp, VARIABLE_OFFSET
	ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
.less_than_256:
	; Reached when len < 256. zmm18 already holds the byte-reflection mask.

	; check if there is enough buffer to be able to fold 16B at a time
	cmp	arg3, 32
	jl	.less_than_32

	; if there is, load the constants
	vmovdqa	xmm10, [rk1]    ; rk1 and rk2 in xmm10

	vmovd	xmm0, arg1_low32	; get the initial crc value
	vpslldq	xmm0, 12		; align it to its correct place
	vmovdqu	xmm7, [arg2]		; load the plaintext
	vpshufb	xmm7, xmm18		; byte-reflect the plaintext
	vpxor	xmm7, xmm0

	; update the buffer pointer
	add	arg2, 16

	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
	; (this gives arg3 the "unread bytes minus 16" bias that .16B_reduction_loop expects)
	sub	arg3, 32

	jmp	.16B_reduction_loop


align 16
.less_than_32:
	; Reached when len < 32.
	; mov initial crc to the return value. this is necessary for zero-length buffers.
	; (.cleanup shifts eax right by 16, which undoes the shl done at function entry)
	mov	eax, arg1_low32
	test	arg3, arg3
	je	.cleanup

	vmovd	xmm0, arg1_low32	; get the initial crc value
	vpslldq	xmm0, 12		; align it to its correct place

	; three-way split on the single compare: len == 16, len < 16, or 16 < len < 32
	cmp	arg3, 16
	je	.exact_16_left
	jl	.less_than_16_left

	; 17..31 bytes: take the first 16 bytes here and let .get_last_two_xmms
	; handle the remaining len-16 (1..15) bytes
	vmovdqu	xmm7, [arg2]		; load the plaintext
	vpshufb	xmm7, xmm18
	vpxor	xmm7, xmm0		; xor the initial crc value
	add	arg2, 16
	sub	arg3, 16
	vmovdqa	xmm10, [rk1]		; rk1 and rk2 in xmm10
	jmp	.get_last_two_xmms

align 16
.less_than_16_left:
	; Reached when 1 <= len <= 15, with xmm0 = initial crc in its top dword.
	; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.

	vpxor	xmm1, xmm1
	mov	r11, rsp		; r11 = write cursor into the scratch buffer
	vmovdqa	[r11], xmm1

	; 1..3 bytes are handled separately: those paths go straight to .barrett
	cmp	arg3, 4
	jl	.only_less_than_4

	; backup the counter value
	; (r9 keeps the original len; arg3 is consumed by the copy ladder below)
	mov	r9, arg3
	cmp	arg3, 8
	jl	.less_than_8_left

	; load 8 Bytes
	mov	rax, [arg2]
	mov	[r11], rax
	add	r11, 8
	sub	arg3, 8
	add	arg2, 8
.less_than_8_left:

	cmp	arg3, 4
	jl	.less_than_4_left

	; load 4 Bytes
	mov	eax, [arg2]
	mov	[r11], eax
	add	r11, 4
	sub	arg3, 4
	add	arg2, 4
.less_than_4_left:

	cmp	arg3, 2
	jl	.less_than_2_left

	; load 2 Bytes
	mov	ax, [arg2]
	mov	[r11], ax
	add	r11, 2
	sub	arg3, 2
	add	arg2, 2
.less_than_2_left:
	cmp	arg3, 1
	jl	.zero_left

	; load 1 Byte
	mov	al, [arg2]
	mov	[r11], al

.zero_left:
	; the scratch buffer now holds the len input bytes followed by zeros
	vmovdqa	xmm7, [rsp]
	vpshufb	xmm7, xmm18	; byte-reflect: the input bytes end up at the top of xmm7
	vpxor	xmm7, xmm0	; xor the initial crc value

	; pick the byte-shift control for the original length (r9) and flip bit 7 of
	; each control byte with mask1; the resulting vpshufb moves the len valid
	; bytes from the top of xmm7 down to the bottom and zero-fills above them
	lea	rax, [pshufb_shf_table + 16]
	sub	rax, r9
	vmovdqu	xmm0, [rax]
	vpxor	xmm0, [mask1]

	vpshufb	xmm7,xmm0
	jmp	.128_done

align 16
.exact_16_left:
	; len == 16: the whole input is a single block and no folding is needed.
	; On entry xmm0 carries the initial crc in its top dword.
	vmovdqu	xmm7, [arg2]			; the 16 input bytes
	vpshufb	xmm7, xmm7, xmm18		; byte-reflect them
	vpxor	xmm7, xmm7, xmm0		; mix in the initial crc
	jmp	.128_done			; reduce the 128-bit value

.only_less_than_4:
	; len is 1, 2 or 3. On entry r11 == rsp and points at the zeroed 16-byte
	; scratch buffer; xmm0 carries the initial crc in its top dword. Each case
	; copies its bytes into the scratch buffer, byte-reflects the block, mixes in
	; the crc, shifts the value right by a length-specific byte count and then
	; goes straight to .barrett, skipping the folds at .128_done.
	cmp	arg3, 3
	jl	.only_less_than_3

	; --- 3 bytes: bytes 0-1 as one word, then byte 2 ---
	mov	ax, [arg2]
	mov	[r11], ax
	mov	al, [arg2+2]
	mov	[r11+2], al

	vmovdqa	xmm7, [r11]
	vpshufb	xmm7, xmm7, xmm18
	vpxor	xmm7, xmm7, xmm0		; mix in the initial crc
	vpsrldq	xmm7, xmm7, 5
	jmp	.barrett

.only_less_than_3:
	cmp	arg3, 2
	jl	.only_less_than_2

	; --- 2 bytes: copied as one word ---
	mov	ax, [arg2]
	mov	[r11], ax

	vmovdqa	xmm7, [r11]
	vpshufb	xmm7, xmm7, xmm18
	vpxor	xmm7, xmm7, xmm0		; mix in the initial crc
	vpsrldq	xmm7, xmm7, 6
	jmp	.barrett

.only_less_than_2:
	; --- 1 byte ---
	mov	al, [arg2]
	mov	[r11], al

	vmovdqa	xmm7, [r11]
	vpshufb	xmm7, xmm7, xmm18
	vpxor	xmm7, xmm7, xmm0		; mix in the initial crc
	vpsrldq	xmm7, xmm7, 7
	jmp	.barrett

section .data
align 32

; Constants used by the code above. The layout matters: rk9 and rk17 are each
; read with a 64-byte load, and the vmovdqa loads of rk1, rk5 and rk7 need
; those labels to stay 16-byte aligned.
%ifndef USE_CONSTS
; precomputed constants
;   rk_1/rk_2   : fold pair for .fold_256_B_loop
;   rk1/rk2     : fold pair for the 16-byte steps
;   rk3/rk4     : fold pair for .fold_128_B_loop and the 256 -> 128 fold
;   rk5/rk6     : 64b and 32b folds at .128_done
;   rk7/rk8     : Barrett reduction
;   rk9..rk20   : per-lane pairs used at .fold_128_B_register

rk_1: dq 0xdccf000000000000
rk_2: dq 0x4b0b000000000000
rk1:  dq 0x2d56000000000000
rk2:  dq 0x06df000000000000
rk3:  dq 0x9d9d000000000000
rk4:  dq 0x7cf5000000000000
rk5:  dq 0x2d56000000000000
rk6:  dq 0x1368000000000000
rk7:  dq 0x00000001f65a57f8
rk8:  dq 0x000000018bb70000
rk9:  dq 0xceae000000000000
rk10: dq 0xbfd6000000000000
rk11: dq 0x1e16000000000000
rk12: dq 0x713c000000000000
rk13: dq 0xf7f9000000000000
rk14: dq 0x80a6000000000000
rk15: dq 0x044c000000000000
rk16: dq 0xe658000000000000
rk17: dq 0xad18000000000000
rk18: dq 0xa497000000000000
rk19: dq 0x6ee3000000000000
rk20: dq 0xe7b5000000000000

; rk_1b/rk_2b repeat the rk1/rk2 values and are followed by a zero pair, so the
; 64-byte load at rk17 yields rk17-rk20, rk1, rk2, 0, 0
rk_1b: dq 0x2d56000000000000
rk_2b: dq 0x06df000000000000
	dq 0x0000000000000000
	dq 0x0000000000000000
%else
; constants supplied from outside through the INCLUDE_CONSTS macro
INCLUDE_CONSTS
%endif

; mask1: xor-ed into a pshufb control to flip bit 7 of every byte
mask1: dq 0x8080808080808080, 0x8080808080808080
; mask2: and-mask that clears the top dword of a 128-bit value
mask2: dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

; SHUF_MASK: pshufb control that reverses the byte order of a 16-byte block
SHUF_MASK: dq 0x08090A0B0C0D0E0F, 0x0001020304050607

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; the code loads 16 bytes starting at pshufb_shf_table + 16 - n
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908
dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
dq 0x8080808080808080, 0x8080808080808080

%else  ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
; On win64 output only, export a label named no_<FUNCTION_NAME> with no code
; behind it; every other output format gets nothing from this file.
%ifidn __OUTPUT_FORMAT__, win64
global no_ %+ FUNCTION_NAME
no_ %+ FUNCTION_NAME %+ :
%endif
%endif ; (AS_FEATURE_LEVEL) >= 10
Commit	Line	Data
f91f0fd5 TL	1	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	2	; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
	3	;
	4	; Redistribution and use in source and binary forms, with or without
	5	; modification, are permitted provided that the following conditions
	6	; are met:
	7	; * Redistributions of source code must retain the above copyright
	8	; notice, this list of conditions and the following disclaimer.
	9	; * Redistributions in binary form must reproduce the above copyright
	10	; notice, this list of conditions and the following disclaimer in
	11	; the documentation and/or other materials provided with the
	12	; distribution.
	13	; * Neither the name of Intel Corporation nor the names of its
	14	; contributors may be used to endorse or promote products derived
	15	; from this software without specific prior written permission.
	16	;
	17	; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	18	; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	19	; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	20	; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	21	; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	22	; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	23	; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	24	; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	25	; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	26	; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	27	; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	28	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	29
	30	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	31	; Function API:
	32	; UINT32 crc16_t10dif_by16_10(
	33	; UINT16 init_crc, //initial CRC value, 16 bits
	34	; const unsigned char *buf, //buffer pointer to calculate CRC on
	35	; UINT64 len //buffer length in bytes (64-bit data)
	36	; );
	37	;
	38	; Authors:
	39	; Erdinc Ozturk
	40	; Vinodh Gopal
	41	; James Guilford
	42	;
	43	; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
	44	; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
	45	;
	46	;
	47
	48	%include "reg_sizes.asm"
	49
	50	%ifndef FUNCTION_NAME
	51	%define FUNCTION_NAME crc16_t10dif_by16_10
	52	%endif
	53
	54	%if (AS_FEATURE_LEVEL) >= 10
	55
	56	[bits 64]
	57	default rel
	58
	59	section .text
	60
	61
	62	%ifidn __OUTPUT_FORMAT__, win64
	63	%xdefine arg1 rcx
	64	%xdefine arg2 rdx
65	%xdefine arg3 r8
66
67	%xdefine arg1_low32 ecx
68	%else
69	%xdefine arg1 rdi
70	%xdefine arg2 rsi
71	%xdefine arg3 rdx
72
73	%xdefine arg1_low32 edi
74	%endif
75
76	%define TMP 16*0
77	%ifidn __OUTPUT_FORMAT__, win64
78	%define XMM_SAVE 16*2
79	%define VARIABLE_OFFSET 16*12+8
80	%else
81	%define VARIABLE_OFFSET 16*2+8
82	%endif
83
84	align 16
20effc67	85	mk_global FUNCTION_NAME, function
f91f0fd5	86	FUNCTION_NAME:
20effc67	87	endbranch
f91f0fd5 TL	88
	89	; adjust the 16-bit initial_crc value, scale it to 32 bits
	90	shl arg1_low32, 16
	91
	92	; After this point, code flow is exactly same as a 32-bit CRC.
	93	; The only difference is before returning eax, we will shift it right 16 bits, to scale back to 16 bits.
	94
	95	sub rsp, VARIABLE_OFFSET
	96
	97	%ifidn __OUTPUT_FORMAT__, win64
	98	; push the xmm registers into the stack to maintain
	99	vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
	100	vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
	101	vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
	102	vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
	103	vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
	104	vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
	105	vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
	106	vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
	107	vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
	108	vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
	109	%endif
	110
	111	vbroadcasti32x4 zmm18, [SHUF_MASK]
	112	cmp arg3, 256
	113	jl .less_than_256
	114
	115	; load the initial crc value
	116	vmovd xmm10, arg1_low32 ; initial crc
	117
	118	; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
	119	; because data will be byte-reflected and will align with initial crc at correct place.
	120	vpslldq xmm10, 12
	121
	122	; receive the initial 64B data, xor the initial crc value
	123	vmovdqu8 zmm0, [arg2+16*0]
	124	vmovdqu8 zmm4, [arg2+16*4]
	125	vpshufb zmm0, zmm0, zmm18
	126	vpshufb zmm4, zmm4, zmm18
	127	vpxorq zmm0, zmm10
	128	vbroadcasti32x4 zmm10, [rk3] ;xmm10 has rk3 and rk4
	129	;imm value of pclmulqdq instruction will determine which constant to use
	130
	131	sub arg3, 256
	132	cmp arg3, 256
	133	jl .fold_128_B_loop
	134
	135	vmovdqu8 zmm7, [arg2+16*8]
	136	vmovdqu8 zmm8, [arg2+16*12]
	137	vpshufb zmm7, zmm7, zmm18
	138	vpshufb zmm8, zmm8, zmm18
	139	vbroadcasti32x4 zmm16, [rk_1] ;zmm16 has rk-1 and rk-2
	140	sub arg3, 256
	141
	142	.fold_256_B_loop:
	143	add arg2, 256
	144	vmovdqu8 zmm3, [arg2+16*0]
	145	vpshufb zmm3, zmm3, zmm18
	146	vpclmulqdq zmm1, zmm0, zmm16, 0x00
	147	vpclmulqdq zmm2, zmm0, zmm16, 0x11
	148	vpxorq zmm0, zmm1, zmm2
	149	vpxorq zmm0, zmm0, zmm3
	150
	151	vmovdqu8 zmm9, [arg2+16*4]
152	vpshufb zmm9, zmm9, zmm18
153	vpclmulqdq zmm5, zmm4, zmm16, 0x00
154	vpclmulqdq zmm6, zmm4, zmm16, 0x11
155	vpxorq zmm4, zmm5, zmm6
156	vpxorq zmm4, zmm4, zmm9
157
158	vmovdqu8 zmm11, [arg2+16*8]
159	vpshufb zmm11, zmm11, zmm18
160	vpclmulqdq zmm12, zmm7, zmm16, 0x00
161	vpclmulqdq zmm13, zmm7, zmm16, 0x11
162	vpxorq zmm7, zmm12, zmm13
163	vpxorq zmm7, zmm7, zmm11
164
165	vmovdqu8 zmm17, [arg2+16*12]
166	vpshufb zmm17, zmm17, zmm18
167	vpclmulqdq zmm14, zmm8, zmm16, 0x00
168	vpclmulqdq zmm15, zmm8, zmm16, 0x11
169	vpxorq zmm8, zmm14, zmm15
170	vpxorq zmm8, zmm8, zmm17
171
172	sub arg3, 256
173	jge .fold_256_B_loop
174
175	;; Fold 256 into 128
176	add arg2, 256
177	vpclmulqdq zmm1, zmm0, zmm10, 0x00
178	vpclmulqdq zmm2, zmm0, zmm10, 0x11
179	vpternlogq zmm7, zmm1, zmm2, 0x96 ; xor ABC
180
181	vpclmulqdq zmm5, zmm4, zmm10, 0x00
182	vpclmulqdq zmm6, zmm4, zmm10, 0x11
183	vpternlogq zmm8, zmm5, zmm6, 0x96 ; xor ABC
184
185	vmovdqa32 zmm0, zmm7
186	vmovdqa32 zmm4, zmm8
187
188	add arg3, 128
189	jmp .fold_128_B_register
190
191
192
193	; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The fold_128_B_loop
194	; loop will fold 128B at a time until we have 128+y Bytes of buffer
195
196	; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
197	.fold_128_B_loop:
198	add arg2, 128
199	vmovdqu8 zmm8, [arg2+16*0]
200	vpshufb zmm8, zmm8, zmm18
201	vpclmulqdq zmm2, zmm0, zmm10, 0x00
202	vpclmulqdq zmm1, zmm0, zmm10, 0x11
203	vpxorq zmm0, zmm2, zmm1
204	vpxorq zmm0, zmm0, zmm8
205
206	vmovdqu8 zmm9, [arg2+16*4]
207	vpshufb zmm9, zmm9, zmm18
208	vpclmulqdq zmm5, zmm4, zmm10, 0x00
209	vpclmulqdq zmm6, zmm4, zmm10, 0x11
210	vpxorq zmm4, zmm5, zmm6
211	vpxorq zmm4, zmm4, zmm9
212
213	sub arg3, 128
214	jge .fold_128_B_loop
215	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
216
217	add arg2, 128
218	; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
219	; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
220
221	.fold_128_B_register:
222	; fold the 8 128b parts into 1 xmm register with different constants
223	vmovdqu8 zmm16, [rk9] ; multiply by rk9-rk16
224	vmovdqu8 zmm11, [rk17] ; multiply by rk17-rk20, rk1,rk2, 0,0
225	vpclmulqdq zmm1, zmm0, zmm16, 0x00
226	vpclmulqdq zmm2, zmm0, zmm16, 0x11
227	vextracti64x2 xmm7, zmm4, 3 ; save last that has no multiplicand
228
229	vpclmulqdq zmm5, zmm4, zmm11, 0x00
230	vpclmulqdq zmm6, zmm4, zmm11, 0x11
231	vmovdqa xmm10, [rk1] ; Needed later in reduction loop
232	vpternlogq zmm1, zmm2, zmm5, 0x96 ; xor ABC
233	vpternlogq zmm1, zmm6, zmm7, 0x96 ; xor ABC
234
235	vshufi64x2 zmm8, zmm1, zmm1, 0x4e ; Swap 1,0,3,2 - 01 00 11 10
236	vpxorq ymm8, ymm8, ymm1
237	vextracti64x2 xmm5, ymm8, 1
238	vpxorq xmm7, xmm5, xmm8
239
240	; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
241	; instead of a cmp instruction, we use the negative flag with the jl instruction
242	add arg3, 128-16
243	jl .final_reduction_for_128
244
245	; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
246	; we can fold 16 bytes at a time if y>=16
247	; continue folding 16B at a time
248
249	.16B_reduction_loop:
250	vpclmulqdq xmm8, xmm7, xmm10, 0x11
251	vpclmulqdq xmm7, xmm7, xmm10, 0x00
252	vpxor xmm7, xmm8
253	vmovdqu xmm0, [arg2]
254	vpshufb xmm0, xmm0, xmm18
255	vpxor xmm7, xmm0
256	add arg2, 16
257	sub arg3, 16
258	; instead of a cmp instruction, we utilize the flags with the jge instruction
259	; equivalent of: cmp arg3, 16-16
260	; check if there is any more 16B in the buffer to be able to fold
261	jge .16B_reduction_loop
262
263	;now we have 16+z bytes left to reduce, where 0<= z < 16.
264	;first, we reduce the data in the xmm7 register
265
266
267	.final_reduction_for_128:
268	add arg3, 16
269	je .128_done
270
271	; here we are getting data that is less than 16 bytes.
272	; since we know that there was data before the pointer, we can offset
273	; the input pointer before the actual point, to receive exactly 16 bytes.
274	; after that the registers need to be adjusted.
275	.get_last_two_xmms:
276
277	vmovdqa xmm2, xmm7
278	vmovdqu xmm1, [arg2 - 16 + arg3]
279	vpshufb xmm1, xmm18
280
281	; get rid of the extra data that was loaded before
282	; load the shift constant
283	lea rax, [pshufb_shf_table + 16]
284	sub rax, arg3
285	vmovdqu xmm0, [rax]
286
287	vpshufb xmm2, xmm0
288	vpxor xmm0, [mask1]
289	vpshufb xmm7, xmm0
290	vpblendvb xmm1, xmm1, xmm2, xmm0
291
292	vpclmulqdq xmm8, xmm7, xmm10, 0x11
293	vpclmulqdq xmm7, xmm7, xmm10, 0x00
294	vpxor xmm7, xmm8
295	vpxor xmm7, xmm1
296
297	.128_done:
298	; compute crc of a 128-bit value
299	vmovdqa xmm10, [rk5]
300	vmovdqa xmm0, xmm7
301
302	;64b fold
303	vpclmulqdq xmm7, xmm10, 0x01 ; H*L
304	vpslldq xmm0, 8
305	vpxor xmm7, xmm0
306
307	;32b fold
308	vmovdqa xmm0, xmm7
309	vpand xmm0, [mask2]
310	vpsrldq xmm7, 12
311	vpclmulqdq xmm7, xmm10, 0x10
312	vpxor xmm7, xmm0
313
314	;barrett reduction
315	.barrett:
316	vmovdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
317	vmovdqa xmm0, xmm7
318	vpclmulqdq xmm7, xmm10, 0x01
319	vpslldq xmm7, 4
320	vpclmulqdq xmm7, xmm10, 0x11
321
322	vpslldq xmm7, 4
323	vpxor xmm7, xmm0
324	vpextrd eax, xmm7, 1
325
326	.cleanup:
327	; scale the result back to 16 bits
328	shr eax, 16
329
330	%ifidn __OUTPUT_FORMAT__, win64
331	vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
332	vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
333	vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
334	vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
335	vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
336	vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
337	vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
338	vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
339	vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
340	vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
341	%endif
342	add rsp, VARIABLE_OFFSET
343	ret
344
345
346	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
347	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
348	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
349	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
350
351	align 16
352	.less_than_256:
353
354	; check if there is enough buffer to be able to fold 16B at a time
355	cmp arg3, 32
356	jl .less_than_32
357
358	; if there is, load the constants
359	vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
360
361	vmovd xmm0, arg1_low32 ; get the initial crc value
362	vpslldq xmm0, 12 ; align it to its correct place
363	vmovdqu xmm7, [arg2] ; load the plaintext
364	vpshufb xmm7, xmm18 ; byte-reflect the plaintext
365	vpxor xmm7, xmm0
366
367	; update the buffer pointer
368	add arg2, 16
369
370	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
371	sub arg3, 32
372
373	jmp .16B_reduction_loop
374
375
376	align 16
377	.less_than_32:
378	; mov initial crc to the return value. this is necessary for zero-length buffers.
379	mov eax, arg1_low32
380	test arg3, arg3
381	je .cleanup
382
383	vmovd xmm0, arg1_low32 ; get the initial crc value
384	vpslldq xmm0, 12 ; align it to its correct place
385
386	cmp arg3, 16
387	je .exact_16_left
388	jl .less_than_16_left
389
390	vmovdqu xmm7, [arg2] ; load the plaintext
391	vpshufb xmm7, xmm18
392	vpxor xmm7, xmm0 ; xor the initial crc value
393	add arg2, 16
394	sub arg3, 16
395	vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
396	jmp .get_last_two_xmms
397
398	align 16
399	.less_than_16_left:
400	; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
401
402	vpxor xmm1, xmm1
403	mov r11, rsp
404	vmovdqa [r11], xmm1
405
406	cmp arg3, 4
407	jl .only_less_than_4
408
409	; backup the counter value
410	mov r9, arg3
411	cmp arg3, 8
412	jl .less_than_8_left
413
414	; load 8 Bytes
415	mov rax, [arg2]
416	mov [r11], rax
417	add r11, 8
418	sub arg3, 8
419	add arg2, 8
420	.less_than_8_left:
421
422	cmp arg3, 4
423	jl .less_than_4_left
424
425	; load 4 Bytes
426	mov eax, [arg2]
427	mov [r11], eax
428	add r11, 4
429	sub arg3, 4
430	add arg2, 4
431	.less_than_4_left:
432
433	cmp arg3, 2
434	jl .less_than_2_left
435
436	; load 2 Bytes
437	mov ax, [arg2]
438	mov [r11], ax
439	add r11, 2
440	sub arg3, 2
441	add arg2, 2
442	.less_than_2_left:
443	cmp arg3, 1
444	jl .zero_left
445
446	; load 1 Byte
447	mov al, [arg2]
448	mov [r11], al
449
450	.zero_left:
451	vmovdqa xmm7, [rsp]
452	vpshufb xmm7, xmm18
453	vpxor xmm7, xmm0 ; xor the initial crc value
454
455	lea rax, [pshufb_shf_table + 16]
456	sub rax, r9
457	vmovdqu xmm0, [rax]
458	vpxor xmm0, [mask1]
459
460	vpshufb xmm7,xmm0
461	jmp .128_done
462
463	align 16
464	.exact_16_left:
465	vmovdqu xmm7, [arg2]
466	vpshufb xmm7, xmm18
467	vpxor xmm7, xmm0 ; xor the initial crc value
468	jmp .128_done
469
470	.only_less_than_4:
471	cmp arg3, 3
472	jl .only_less_than_3
473
474	; load 3 Bytes
475	mov al, [arg2]
476	mov [r11], al
477
478	mov al, [arg2+1]
479	mov [r11+1], al
480
481	mov al, [arg2+2]
482	mov [r11+2], al
483
484	vmovdqa xmm7, [rsp]
485	vpshufb xmm7, xmm18
486	vpxor xmm7, xmm0 ; xor the initial crc value
487
488	vpsrldq xmm7, 5
489	jmp .barrett
490
491	.only_less_than_3:
492	cmp arg3, 2
493	jl .only_less_than_2
494
495	; load 2 Bytes
496	mov al, [arg2]
497	mov [r11], al
498
499	mov al, [arg2+1]
500	mov [r11+1], al
501
502	vmovdqa xmm7, [rsp]
503	vpshufb xmm7, xmm18
504	vpxor xmm7, xmm0 ; xor the initial crc value
505
506	vpsrldq xmm7, 6
507	jmp .barrett
508
509	.only_less_than_2:
510	; load 1 Byte
511	mov al, [arg2]
512	mov [r11], al
513
514	vmovdqa xmm7, [rsp]
515	vpshufb xmm7, xmm18
516	vpxor xmm7, xmm0 ; xor the initial crc value
517
518	vpsrldq xmm7, 7
519	jmp .barrett
520
521	section .data
522	align 32
523
524	%ifndef USE_CONSTS
525	; precomputed constants
526
527	rk_1: dq 0xdccf000000000000
528	rk_2: dq 0x4b0b000000000000
529	rk1: dq 0x2d56000000000000
530	rk2: dq 0x06df000000000000
531	rk3: dq 0x9d9d000000000000
532	rk4: dq 0x7cf5000000000000
533	rk5: dq 0x2d56000000000000
534	rk6: dq 0x1368000000000000
535	rk7: dq 0x00000001f65a57f8
536	rk8: dq 0x000000018bb70000
537	rk9: dq 0xceae000000000000
538	rk10: dq 0xbfd6000000000000
539	rk11: dq 0x1e16000000000000
540	rk12: dq 0x713c000000000000
541	rk13: dq 0xf7f9000000000000
542	rk14: dq 0x80a6000000000000
543	rk15: dq 0x044c000000000000
544	rk16: dq 0xe658000000000000
545	rk17: dq 0xad18000000000000
546	rk18: dq 0xa497000000000000
547	rk19: dq 0x6ee3000000000000
548	rk20: dq 0xe7b5000000000000
549
550	rk_1b: dq 0x2d56000000000000
551	rk_2b: dq 0x06df000000000000
552	dq 0x0000000000000000
553	dq 0x0000000000000000
554	%else
555	INCLUDE_CONSTS
556	%endif
557
558	mask1: dq 0x8080808080808080, 0x8080808080808080
559	mask2: dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
560
561	SHUF_MASK: dq 0x08090A0B0C0D0E0F, 0x0001020304050607
562
563	pshufb_shf_table:
564	; use these values for shift constants for the pshufb instruction
565	; different alignments result in values as shown:
566	; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
567	; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2
568	; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3
569	; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
570	; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
571	; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
572	; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
573	; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
574	; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
575	; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
576	; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
577	; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
578	; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
579	; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
580	; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
581	dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
582	dq 0x0706050403020100, 0x000e0d0c0b0a0908
583	dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
584	dq 0x8080808080808080, 0x8080808080808080
585
586	%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
587	%ifidn __OUTPUT_FORMAT__, win64
588	global no_ %+ FUNCTION_NAME
589	no_ %+ FUNCTION_NAME %+ :
590	%endif
591	%endif ; (AS_FEATURE_LEVEL) >= 10