;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2020 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;       Function API:
;       UINT32 crc16_t10dif_by16_10(
;               UINT16 init_crc,          //initial CRC value, 16 bits
;               const unsigned char *buf, //buffer pointer to calculate CRC on
;               UINT64 len                //buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
;
;

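;       Editorial usage sketch (not part of the original source): this is the
;       VPCLMULQDQ/AVX-512 build of the T10-DIF CRC; in isa-l it is normally reached
;       through the dispatched crc16_t10dif() entry point rather than called directly
;       (an assumption based on the library's usual multibinary layout). Calling the
;       symbol directly would look roughly like the hedged C fragment below, whose
;       prototype mirrors the API comment above; checksum_block is an illustrative
;       helper, not an isa-l function.
;
;           #include <stdint.h>
;
;           uint32_t crc16_t10dif_by16_10(uint16_t init_crc,
;                                         const unsigned char *buf,
;                                         uint64_t len);
;
;           uint16_t checksum_block(const unsigned char *buf, uint64_t len)
;           {
;                   /* 0 is the customary starting seed; a previous CRC may be chained in */
;                   return (uint16_t)crc16_t10dif_by16_10(0, buf, len);
;           }
;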
%include "reg_sizes.asm"

%ifndef FUNCTION_NAME
%define FUNCTION_NAME crc16_t10dif_by16_10
%endif

%if (AS_FEATURE_LEVEL) >= 10

[bits 64]
default rel

section .text


%ifidn __OUTPUT_FORMAT__, win64
        %xdefine arg1 rcx
        %xdefine arg2 rdx
        %xdefine arg3 r8

        %xdefine arg1_low32 ecx
%else
        %xdefine arg1 rdi
        %xdefine arg2 rsi
        %xdefine arg3 rdx

        %xdefine arg1_low32 edi
%endif

%define TMP 16*0
%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_SAVE 16*2
        %define VARIABLE_OFFSET 16*12+8
%else
        %define VARIABLE_OFFSET 16*2+8
%endif

align 16
global FUNCTION_NAME:ISAL_SYM_TYPE_FUNCTION
FUNCTION_NAME:

        ; adjust the 16-bit initial_crc value, scale it up to 32 bits
        shl     arg1_low32, 16

        ; After this point, the code flow is exactly the same as for a 32-bit CRC.
        ; The only difference is that before returning eax, we shift it right by 16 bits, to scale back to 16 bits.
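        ;
        ; Editorial worked example (not in the original source): if init_crc = 0xABCD,
        ; the shl above turns it into 0xABCD0000, so it lines up with the top 16 bits
        ; of the 32-bit folding math that follows; the final "shr eax, 16" in .cleanup
        ; undoes this scaling and returns the 16-bit CRC.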

        sub     rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
        ; push the xmm registers onto the stack to preserve them (xmm6-xmm15 are callee-saved in the win64 ABI)
        vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
        vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
        vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
        vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
        vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
        vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
        vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
        vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
        vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
        vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
%endif

        vbroadcasti32x4 zmm18, [SHUF_MASK]
        cmp     arg3, 256
        jl      .less_than_256

        ; load the initial crc value
        vmovd   xmm10, arg1_low32       ; initial crc

        ; the crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register,
        ; because the data will be byte-reflected and will then align with the initial crc in the correct place
        vpslldq xmm10, 12

        ; load the initial 128B of data (two 64B vectors) and xor in the initial crc value
        vmovdqu8 zmm0, [arg2+16*0]
        vmovdqu8 zmm4, [arg2+16*4]
        vpshufb zmm0, zmm0, zmm18
        vpshufb zmm4, zmm4, zmm18
        vpxorq  zmm0, zmm10
        vbroadcasti32x4 zmm10, [rk3]    ; zmm10 has rk3 and rk4 in every 128-bit lane
                                        ; the imm value of the vpclmulqdq instruction determines which constant is used
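        ;
        ; Editorial note on the folding math (a sketch following the referenced Intel
        ; paper, not part of the original source): each 128-bit lane holds a polynomial
        ; over GF(2). With a constant pair K_lo:K_hi broadcast into a register (rk_1/rk_2
        ; or rk3/rk4 below), one fold step computes, per lane,
        ;       new_state = data XOR clmul(state.low64, K_lo) XOR clmul(state.high64, K_hi)
        ; where clmul is carry-less multiplication: vpclmulqdq imm 0x00 selects the low
        ; 64-bit halves of both sources and imm 0x11 the high halves, which is how the
        ; two products are formed in the loops below.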

        sub     arg3, 256
        cmp     arg3, 256
        jl      .fold_128_B_loop

        vmovdqu8 zmm7, [arg2+16*8]
        vmovdqu8 zmm8, [arg2+16*12]
        vpshufb zmm7, zmm7, zmm18
        vpshufb zmm8, zmm8, zmm18
        vbroadcasti32x4 zmm16, [rk_1]   ; zmm16 has rk_1 and rk_2
        sub     arg3, 256

.fold_256_B_loop:
        add     arg2, 256
        vmovdqu8 zmm3, [arg2+16*0]
        vpshufb zmm3, zmm3, zmm18
        vpclmulqdq zmm1, zmm0, zmm16, 0x00
        vpclmulqdq zmm2, zmm0, zmm16, 0x11
        vpxorq  zmm0, zmm1, zmm2
        vpxorq  zmm0, zmm0, zmm3

        vmovdqu8 zmm9, [arg2+16*4]
        vpshufb zmm9, zmm9, zmm18
        vpclmulqdq zmm5, zmm4, zmm16, 0x00
        vpclmulqdq zmm6, zmm4, zmm16, 0x11
        vpxorq  zmm4, zmm5, zmm6
        vpxorq  zmm4, zmm4, zmm9

        vmovdqu8 zmm11, [arg2+16*8]
        vpshufb zmm11, zmm11, zmm18
        vpclmulqdq zmm12, zmm7, zmm16, 0x00
        vpclmulqdq zmm13, zmm7, zmm16, 0x11
        vpxorq  zmm7, zmm12, zmm13
        vpxorq  zmm7, zmm7, zmm11

        vmovdqu8 zmm17, [arg2+16*12]
        vpshufb zmm17, zmm17, zmm18
        vpclmulqdq zmm14, zmm8, zmm16, 0x00
        vpclmulqdq zmm15, zmm8, zmm16, 0x11
        vpxorq  zmm8, zmm14, zmm15
        vpxorq  zmm8, zmm8, zmm17

        sub     arg3, 256
        jge     .fold_256_B_loop

        ;; Fold 256 into 128
        add     arg2, 256
        vpclmulqdq zmm1, zmm0, zmm10, 0x00
        vpclmulqdq zmm2, zmm0, zmm10, 0x11
        vpternlogq zmm7, zmm1, zmm2, 0x96       ; xor ABC

        vpclmulqdq zmm5, zmm4, zmm10, 0x00
        vpclmulqdq zmm6, zmm4, zmm10, 0x11
        vpternlogq zmm8, zmm5, zmm6, 0x96       ; xor ABC
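
        ; Editorial note (not in the original source): vpternlogq with imm8 0x96
        ; implements a three-way XOR (0x96 is the truth table of A xor B xor C), so each
        ; of the two instructions above xors the two carry-less products of one state
        ; register into another state register in a single step, collapsing the 256B of
        ; folded state down to 128B.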

        vmovdqa32 zmm0, zmm7
        vmovdqa32 zmm4, zmm8

        add     arg3, 128
        jmp     .fold_128_B_register



        ; at this point, there are 128*x+y (0 <= y < 128) bytes of buffer remaining. The fold_128_B_loop
        ; will fold 128B at a time until only 128+y bytes of buffer remain

        ; fold 128B at a time. This section of the code folds 2 zmm registers (8 128-bit lanes) in parallel
.fold_128_B_loop:
        add     arg2, 128
        vmovdqu8 zmm8, [arg2+16*0]
        vpshufb zmm8, zmm8, zmm18
        vpclmulqdq zmm2, zmm0, zmm10, 0x00
        vpclmulqdq zmm1, zmm0, zmm10, 0x11
        vpxorq  zmm0, zmm2, zmm1
        vpxorq  zmm0, zmm0, zmm8

        vmovdqu8 zmm9, [arg2+16*4]
        vpshufb zmm9, zmm9, zmm18
        vpclmulqdq zmm5, zmm4, zmm10, 0x00
        vpclmulqdq zmm6, zmm4, zmm10, 0x11
        vpxorq  zmm4, zmm5, zmm6
        vpxorq  zmm4, zmm4, zmm9

        sub     arg3, 128
        jge     .fold_128_B_loop
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        add     arg2, 128
        ; at this point, the buffer pointer is pointing at the last y bytes of the buffer, where 0 <= y < 128
        ; the 128B of folded data is in 2 zmm registers: zmm0 and zmm4 (8 xmm-sized lanes)

.fold_128_B_register:
        ; fold the 8 128b parts into 1 xmm register with different constants
        vmovdqu8 zmm16, [rk9]           ; multiply by rk9-rk16
        vmovdqu8 zmm11, [rk17]          ; multiply by rk17-rk20, rk1,rk2, 0,0
        vpclmulqdq zmm1, zmm0, zmm16, 0x00
        vpclmulqdq zmm2, zmm0, zmm16, 0x11
        vextracti64x2 xmm7, zmm4, 3     ; save last that has no multiplicand

        vpclmulqdq zmm5, zmm4, zmm11, 0x00
        vpclmulqdq zmm6, zmm4, zmm11, 0x11
        vmovdqa xmm10, [rk1]            ; needed later in reduction loop
        vpternlogq zmm1, zmm2, zmm5, 0x96       ; xor ABC
        vpternlogq zmm1, zmm6, zmm7, 0x96       ; xor ABC

        vshufi64x2 zmm8, zmm1, zmm1, 0x4e       ; Swap 1,0,3,2 - 01 00 11 10
        vpxorq  ymm8, ymm8, ymm1
        vextracti64x2 xmm5, ymm8, 1
        vpxorq  xmm7, xmm5, xmm8
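
        ; Editorial note (not in the original source): after the per-lane folds above,
        ; zmm1 holds the partially reduced 128-bit lanes. The vshufi64x2/vpxorq/
        ; vextracti64x2 sequence is a binary XOR-tree reduction: the 512-bit value is
        ; xored down to 256 bits, then to 128 bits, leaving the single 128-bit
        ; remainder in xmm7.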

        ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
        ; instead of a cmp instruction, we use the negative flag with the jl instruction
        add     arg3, 128-16
        jl      .final_reduction_for_128

        ; now we have 16+y bytes left to reduce. 16 bytes are in register xmm7 and the rest is in memory
        ; we can fold 16 bytes at a time if y>=16
        ; continue folding 16B at a time

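        ; Editorial note (not in the original source): each iteration below folds the
        ; current 128-bit remainder in xmm7 over the next 16 data bytes (byte-reflected
        ; with SHUF_MASK), i.e. roughly
        ;       xmm7 = [arg2] XOR clmul(xmm7.low64, rk1) XOR clmul(xmm7.high64, rk2)
        ; where imm 0x00 picks the low halves and imm 0x11 the high halves of the
        ; rk1:rk2 pair held in xmm10.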
.16B_reduction_loop:
        vpclmulqdq xmm8, xmm7, xmm10, 0x11
        vpclmulqdq xmm7, xmm7, xmm10, 0x00
        vpxor   xmm7, xmm8
        vmovdqu xmm0, [arg2]
        vpshufb xmm0, xmm0, xmm18
        vpxor   xmm7, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the jge instruction
        ; equivalent of: cmp arg3, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge     .16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm7 register


.final_reduction_for_128:
        add     arg3, 16
        je      .128_done

        ; here we are dealing with a leftover of less than 16 bytes.
        ; since we know that there was data before the pointer, we can offset
        ; the input pointer backwards so that exactly 16 bytes are loaded.
        ; after that, the registers need to be adjusted.
.get_last_two_xmms:

        vmovdqa xmm2, xmm7
        vmovdqu xmm1, [arg2 - 16 + arg3]
        vpshufb xmm1, xmm18

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, arg3
        vmovdqu xmm0, [rax]

        vpshufb xmm2, xmm0
        vpxor   xmm0, [mask1]
        vpshufb xmm7, xmm0
        vpblendvb xmm1, xmm1, xmm2, xmm0
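
        ; Editorial note (not in the original source): the pshufb_shf_table entry picked
        ; above, together with its mask1-inverted control, splits the current remainder
        ; into two complementary byte-shifted copies (xmm2 and xmm7). The blend then
        ; merges the displaced remainder bytes with the last 16 loaded data bytes, so
        ; every input byte is accounted for exactly once; one more clmul fold of the
        ; shifted remainder plus the xor with xmm1 below absorbs the final z bytes.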

        vpclmulqdq xmm8, xmm7, xmm10, 0x11
        vpclmulqdq xmm7, xmm7, xmm10, 0x00
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm1

.128_done:
        ; compute crc of a 128-bit value
        vmovdqa xmm10, [rk5]
        vmovdqa xmm0, xmm7

        ; 64b fold
        vpclmulqdq xmm7, xmm10, 0x01    ; H*L
        vpslldq xmm0, 8
        vpxor   xmm7, xmm0

        ; 32b fold
        vmovdqa xmm0, xmm7
        vpand   xmm0, [mask2]
        vpsrldq xmm7, 12
        vpclmulqdq xmm7, xmm10, 0x10
        vpxor   xmm7, xmm0

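        ; Editorial note on the reduction below (a sketch following the referenced Intel
        ; paper, not part of the original source): Barrett reduction estimates the
        ; quotient of the remaining value by the CRC polynomial with one carry-less
        ; multiply by rk7 (an approximation of x^64 / P(x)), multiplies that quotient
        ; back by rk8 (the scaled T10-DIF polynomial, 0x18bb7 << 16), and xors the
        ; result with the original value so only the remainder of degree < 32 survives;
        ; vpextrd then extracts that 32-bit remainder (the CRC scaled by x^16).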
        ; barrett reduction
.barrett:
        vmovdqa xmm10, [rk7]    ; rk7 and rk8 in xmm10
        vmovdqa xmm0, xmm7
        vpclmulqdq xmm7, xmm10, 0x01
        vpslldq xmm7, 4
        vpclmulqdq xmm7, xmm10, 0x11

        vpslldq xmm7, 4
        vpxor   xmm7, xmm0
        vpextrd eax, xmm7, 1

.cleanup:
        ; scale the result back to 16 bits
        shr     eax, 16

%ifidn __OUTPUT_FORMAT__, win64
        vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
        vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
        vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
        vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
        vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
        vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
        vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
        vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
        vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
        vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
%endif
        add     rsp, VARIABLE_OFFSET
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
.less_than_256:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      .less_than_32

        ; if there is, load the constants
        vmovdqa xmm10, [rk1]            ; rk1 and rk2 in xmm10

        vmovd   xmm0, arg1_low32        ; get the initial crc value
        vpslldq xmm0, 12                ; align it to its correct place
        vmovdqu xmm7, [arg2]            ; load the plaintext
        vpshufb xmm7, xmm18             ; byte-reflect the plaintext
        vpxor   xmm7, xmm0

        ; update the buffer pointer
        add     arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        sub     arg3, 32

        jmp     .16B_reduction_loop


align 16
.less_than_32:
        ; mov initial crc to the return value. this is necessary for zero-length buffers.
        mov     eax, arg1_low32
        test    arg3, arg3
        je      .cleanup

        vmovd   xmm0, arg1_low32        ; get the initial crc value
        vpslldq xmm0, 12                ; align it to its correct place

        cmp     arg3, 16
        je      .exact_16_left
        jl      .less_than_16_left

        vmovdqu xmm7, [arg2]            ; load the plaintext
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0              ; xor the initial crc value
        add     arg2, 16
        sub     arg3, 16
        vmovdqa xmm10, [rk1]            ; rk1 and rk2 in xmm10
        jmp     .get_last_two_xmms

align 16
.less_than_16_left:
        ; use stack space to load data of less than 16 bytes; zero out those 16B on the stack first.

        vpxor   xmm1, xmm1
        mov     r11, rsp
        vmovdqa [r11], xmm1

        cmp     arg3, 4
        jl      .only_less_than_4

        ; backup the counter value
        mov     r9, arg3
        cmp     arg3, 8
        jl      .less_than_8_left

        ; load 8 bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
.less_than_8_left:

        cmp     arg3, 4
        jl      .less_than_4_left

        ; load 4 bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
.less_than_4_left:

        cmp     arg3, 2
        jl      .less_than_2_left

        ; load 2 bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
.less_than_2_left:
        cmp     arg3, 1
        jl      .zero_left

        ; load 1 byte
        mov     al, [arg2]
        mov     [r11], al

.zero_left:
        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value

        lea     rax, [pshufb_shf_table + 16]
        sub     rax, r9
        vmovdqu xmm0, [rax]
        vpxor   xmm0, [mask1]

        vpshufb xmm7, xmm0
        jmp     .128_done

align 16
.exact_16_left:
        vmovdqu xmm7, [arg2]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value
        jmp     .128_done

.only_less_than_4:
        cmp     arg3, 3
        jl      .only_less_than_3

        ; load 3 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        mov     al, [arg2+2]
        mov     [r11+2], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value

        vpsrldq xmm7, 5
        jmp     .barrett

.only_less_than_3:
        cmp     arg3, 2
        jl      .only_less_than_2

        ; load 2 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value

        vpsrldq xmm7, 6
        jmp     .barrett

.only_less_than_2:
        ; load 1 byte
        mov     al, [arg2]
        mov     [r11], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value

        vpsrldq xmm7, 7
        jmp     .barrett

section .data
align 32

%ifndef USE_CONSTS
; precomputed constants

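; Editorial note (not in the original source): following the referenced Intel paper,
; the rk* values are precomputed folding constants for the T10-DIF polynomial
; (0x18bb7, scaled up to 32 bits). Each folding constant has the form x^n mod P(x)
; for the bit distance n that the corresponding fold step shifts the running
; remainder; rk7 is the Barrett constant (a quotient approximation of x^64 / P(x)),
; and rk8 is the scaled polynomial itself (0x18bb7 << 16).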
rk_1:  dq 0xdccf000000000000
rk_2:  dq 0x4b0b000000000000
rk1:   dq 0x2d56000000000000
rk2:   dq 0x06df000000000000
rk3:   dq 0x9d9d000000000000
rk4:   dq 0x7cf5000000000000
rk5:   dq 0x2d56000000000000
rk6:   dq 0x1368000000000000
rk7:   dq 0x00000001f65a57f8
rk8:   dq 0x000000018bb70000
rk9:   dq 0xceae000000000000
rk10:  dq 0xbfd6000000000000
rk11:  dq 0x1e16000000000000
rk12:  dq 0x713c000000000000
rk13:  dq 0xf7f9000000000000
rk14:  dq 0x80a6000000000000
rk15:  dq 0x044c000000000000
rk16:  dq 0xe658000000000000
rk17:  dq 0xad18000000000000
rk18:  dq 0xa497000000000000
rk19:  dq 0x6ee3000000000000
rk20:  dq 0xe7b5000000000000

rk_1b: dq 0x2d56000000000000
rk_2b: dq 0x06df000000000000
       dq 0x0000000000000000
       dq 0x0000000000000000
%else
INCLUDE_CONSTS
%endif

mask1:     dq 0x8080808080808080, 0x8080808080808080
mask2:     dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

SHUF_MASK: dq 0x08090A0B0C0D0E0F, 0x0001020304050607
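
; Editorial note (not in the original source): SHUF_MASK is the vpshufb control that
; reverses the byte order within each 16-byte lane, so the input data (and the initial
; CRC placed in the high bytes) is processed in the byte order the folding constants
; above were generated for.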

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89       ; shl 15 (16-1)  / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a       ; shl 14 (16-2)  / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b       ; shl 13 (16-3)  / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c       ; shl 12 (16-4)  / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d       ; shl 11 (16-5)  / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e       ; shl 10 (16-6)  / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f       ; shl 9  (16-7)  / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100       ; shl 8  (16-8)  / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201       ; shl 7  (16-9)  / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302       ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403       ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504       ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605       ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706       ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807       ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908
dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
dq 0x8080808080808080, 0x8080808080808080

%else  ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
%ifidn __OUTPUT_FORMAT__, win64
global no_ %+ FUNCTION_NAME
no_ %+ FUNCTION_NAME %+ :
%endif
%endif ; (AS_FEATURE_LEVEL) >= 10