;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2020 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;       Function API:
;       UINT32 crc32_ieee_by16_10(
;               UINT32 init_crc,          // initial CRC value, 32 bits
;               const unsigned char *buf, // buffer pointer to calculate CRC on
;               UINT64 len                // buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
;
;
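;       Illustrative call sequence (not part of this file; "buffer" and "buffer_len" are
;       placeholder names, register mapping assumes the SysV AMD64 path defined below,
;       result returned in eax):
;
;               xor     edi, edi                ; init_crc = 0
;               lea     rsi, [buffer]           ; buf - pointer to the data
;               mov     rdx, buffer_len         ; len - length in bytes
;               call    crc32_ieee_by16_10
;               ; eax now holds the CRC; it can be passed back as init_crc to
;               ; continue a running CRC over a following chunk of data
;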

%include "reg_sizes.asm"

%ifndef FUNCTION_NAME
%define FUNCTION_NAME crc32_ieee_by16_10
%endif

%if (AS_FEATURE_LEVEL) >= 10

[bits 64]
default rel

section .text


%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8

        %xdefine        arg1_low32 ecx
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx

        %xdefine        arg1_low32 edi
%endif

%define TMP 16*0
%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_SAVE 16*2
        %define VARIABLE_OFFSET 16*12+8
%else
        %define VARIABLE_OFFSET 16*2+8
%endif

align 16
global FUNCTION_NAME:ISAL_SYM_TYPE_FUNCTION
FUNCTION_NAME:

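        ; Note: the seed is bit-inverted here and the result is inverted again in .cleanup;
        ; this is the usual CRC pre/post complement, so a caller can feed the returned value
        ; straight back in as init_crc to continue the CRC over the next chunk.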
        not     arg1_low32
        sub     rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
        ; save the xmm registers on the stack to preserve them (callee-saved under win64)
        vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
        vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
        vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
        vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
        vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
        vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
        vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
        vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
        vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
        vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
%endif

        vbroadcasti32x4 zmm18, [SHUF_MASK]
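        ; SHUF_MASK reverses the byte order within each 16-byte lane; crc32_ieee uses the
        ; non-reflected bit ordering, so every block of input is byte-swapped before it is
        ; folded with vpclmulqdq.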
        cmp     arg3, 256
        jl      .less_than_256

        ; load the initial crc value
        vmovd   xmm10, arg1_low32      ; initial crc

        ; the crc value does not need to be byte-reflected, but it does need to be moved
        ; to the high part of the register, because the data will be byte-reflected and
        ; will then line up with the initial crc in the correct place.
        vpslldq xmm10, 12

        ; receive the initial 128B of data, xor in the initial crc value
        vmovdqu8 zmm0, [arg2+16*0]
        vmovdqu8 zmm4, [arg2+16*4]
        vpshufb zmm0, zmm0, zmm18
        vpshufb zmm4, zmm4, zmm18
        vpxorq  zmm0, zmm10
        vbroadcasti32x4 zmm10, [rk3]   ; zmm10 has rk3 and rk4 in every 128-bit lane
                                       ; imm value of pclmulqdq instruction will determine which constant to use
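        ; Each vpclmulqdq below picks its 64-bit inputs via the immediate: 0x00 multiplies
        ; the low qwords of both source lanes, 0x11 multiplies the high qwords, so one
        ; broadcast constant register serves both halves of every folded lane.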

        sub     arg3, 256
        cmp     arg3, 256
        jl      .fold_128_B_loop

        vmovdqu8 zmm7, [arg2+16*8]
        vmovdqu8 zmm8, [arg2+16*12]
        vpshufb zmm7, zmm7, zmm18
        vpshufb zmm8, zmm8, zmm18
        vbroadcasti32x4 zmm16, [rk_1]  ; zmm16 has rk_1 and rk_2
        sub     arg3, 256

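        ; The 256-byte main loop keeps the running remainder in zmm0, zmm4, zmm7 and zmm8.
        ; Each iteration folds every 128-bit lane forward over the next 256 bytes:
        ; lane' = (lane.lo x rk_1) xor (lane.hi x rk_2) xor new_data, where rk_1/rk_2 are
        ; precomputed x^n mod P constants (see the white paper referenced above); a
        ; carry-less multiply by x^n is how a CRC remainder is advanced across skipped bytes.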
.fold_256_B_loop:
        add     arg2, 256
        vmovdqu8 zmm3, [arg2+16*0]
        vpshufb zmm3, zmm3, zmm18
        vpclmulqdq zmm1, zmm0, zmm16, 0x00
        vpclmulqdq zmm2, zmm0, zmm16, 0x11
        vpxorq  zmm0, zmm1, zmm2
        vpxorq  zmm0, zmm0, zmm3

        vmovdqu8 zmm9, [arg2+16*4]
        vpshufb zmm9, zmm9, zmm18
        vpclmulqdq zmm5, zmm4, zmm16, 0x00
        vpclmulqdq zmm6, zmm4, zmm16, 0x11
        vpxorq  zmm4, zmm5, zmm6
        vpxorq  zmm4, zmm4, zmm9

        vmovdqu8 zmm11, [arg2+16*8]
        vpshufb zmm11, zmm11, zmm18
        vpclmulqdq zmm12, zmm7, zmm16, 0x00
        vpclmulqdq zmm13, zmm7, zmm16, 0x11
        vpxorq  zmm7, zmm12, zmm13
        vpxorq  zmm7, zmm7, zmm11

        vmovdqu8 zmm17, [arg2+16*12]
        vpshufb zmm17, zmm17, zmm18
        vpclmulqdq zmm14, zmm8, zmm16, 0x00
        vpclmulqdq zmm15, zmm8, zmm16, 0x11
        vpxorq  zmm8, zmm14, zmm15
        vpxorq  zmm8, zmm8, zmm17

        sub     arg3, 256
        jge     .fold_256_B_loop

        ;; Fold 256 into 128
        add     arg2, 256
        vpclmulqdq zmm1, zmm0, zmm10, 0x00
        vpclmulqdq zmm2, zmm0, zmm10, 0x11
        vpternlogq zmm7, zmm1, zmm2, 0x96      ; xor ABC

        vpclmulqdq zmm5, zmm4, zmm10, 0x00
        vpclmulqdq zmm6, zmm4, zmm10, 0x11
        vpternlogq zmm8, zmm5, zmm6, 0x96      ; xor ABC

        vmovdqa32 zmm0, zmm7
        vmovdqa32 zmm4, zmm8

        add     arg3, 128
        jmp     .fold_128_B_register


        ; At this point there are 128*x+y (0 <= y < 128) bytes of buffer left. The
        ; .fold_128_B_loop folds 128 bytes at a time until only 128+y bytes remain.

        ; fold 128 bytes at a time; this section folds 8 128-bit lanes (zmm0 and zmm4) in parallel
.fold_128_B_loop:
        add     arg2, 128
        vmovdqu8 zmm8, [arg2+16*0]
        vpshufb zmm8, zmm8, zmm18
        vpclmulqdq zmm2, zmm0, zmm10, 0x00
        vpclmulqdq zmm1, zmm0, zmm10, 0x11
        vpxorq  zmm0, zmm2, zmm1
        vpxorq  zmm0, zmm0, zmm8

        vmovdqu8 zmm9, [arg2+16*4]
        vpshufb zmm9, zmm9, zmm18
        vpclmulqdq zmm5, zmm4, zmm10, 0x00
        vpclmulqdq zmm6, zmm4, zmm10, 0x11
        vpxorq  zmm4, zmm5, zmm6
        vpxorq  zmm4, zmm4, zmm9

        sub     arg3, 128
        jge     .fold_128_B_loop
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        add     arg2, 128
        ; at this point, the buffer pointer is pointing at the last y bytes of the buffer, where 0 <= y < 128
        ; the 128 bytes of folded data are held in zmm0 and zmm4 (8 lanes of 128 bits each)

.fold_128_B_register:
        ; fold the 8 128b parts into 1 xmm register with different constants
        vmovdqu8 zmm16, [rk9]          ; multiply by rk9-rk16
        vmovdqu8 zmm11, [rk17]         ; multiply by rk17-rk20, rk1,rk2, 0,0
        vpclmulqdq zmm1, zmm0, zmm16, 0x00
        vpclmulqdq zmm2, zmm0, zmm16, 0x11
        vextracti64x2 xmm7, zmm4, 3    ; save last that has no multiplicand

        vpclmulqdq zmm5, zmm4, zmm11, 0x00
        vpclmulqdq zmm6, zmm4, zmm11, 0x11
        vmovdqa xmm10, [rk1]           ; Needed later in reduction loop
        vpternlogq zmm1, zmm2, zmm5, 0x96      ; xor ABC
        vpternlogq zmm1, zmm6, zmm7, 0x96      ; xor ABC

        vshufi64x2 zmm8, zmm1, zmm1, 0x4e      ; Swap 1,0,3,2 - 01 00 11 10
        vpxorq  ymm8, ymm8, ymm1
        vextracti64x2 xmm5, ymm8, 1
        vpxorq  xmm7, xmm5, xmm8

        ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
        ; instead of a cmp instruction, we use the negative flag with the jl instruction
        add     arg3, 128-16
        jl      .final_reduction_for_128

        ; now we have 16+y bytes left to reduce. 16 bytes is in register xmm7 and the rest is in memory
        ; we can fold 16 bytes at a time if y>=16
        ; continue folding 16B at a time

.16B_reduction_loop:
        vpclmulqdq xmm8, xmm7, xmm10, 0x11
        vpclmulqdq xmm7, xmm7, xmm10, 0x00
        vpxor   xmm7, xmm8
        vmovdqu xmm0, [arg2]
        vpshufb xmm0, xmm0, xmm18
        vpxor   xmm7, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the jge instruction
        ; equivalent of: cmp arg3, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge     .16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm7 register


.final_reduction_for_128:
        add     arg3, 16
        je      .128_done

        ; here the remaining data is less than 16 bytes.
        ; since we know there was data before the pointer, we can offset the input pointer
        ; back so that exactly 16 bytes are loaded; the registers then need to be adjusted.
.get_last_two_xmms:

        vmovdqa xmm2, xmm7
        vmovdqu xmm1, [arg2 - 16 + arg3]
        vpshufb xmm1, xmm18

        ; get rid of the extra data that was loaded before
        ; load the shift constant
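        ; (the entry at [pshufb_shf_table + 16 - len] is a byte-shuffle mask; applied directly
        ;  and with its [mask1] complement it splits the previous remainder between xmm7 and
        ;  xmm2, and the xmm2 part is blended over the reloaded tail block with vpblendvb below)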
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, arg3
        vmovdqu xmm0, [rax]

        vpshufb xmm2, xmm0
        vpxor   xmm0, [mask1]
        vpshufb xmm7, xmm0
        vpblendvb xmm1, xmm1, xmm2, xmm0

        vpclmulqdq xmm8, xmm7, xmm10, 0x11
        vpclmulqdq xmm7, xmm7, xmm10, 0x00
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm1

.128_done:
        ; compute crc of a 128-bit value
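        ; (the 128-bit remainder is folded to 64 bits with rk5, then to a 32-bit-aligned value
        ;  with rk6, before the Barrett reduction below produces the final 32-bit CRC; this is
        ;  the reduction sequence described in the referenced paper)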
        vmovdqa xmm10, [rk5]
        vmovdqa xmm0, xmm7

        ;64b fold
        vpclmulqdq xmm7, xmm10, 0x01   ; H*L
        vpslldq xmm0, 8
        vpxor   xmm7, xmm0

        ;32b fold
        vmovdqa xmm0, xmm7
        vpand   xmm0, [mask2]
        vpsrldq xmm7, 12
        vpclmulqdq xmm7, xmm10, 0x10
        vpxor   xmm7, xmm0

        ;barrett reduction
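        ; (Barrett reduction produces the final remainder without a division: rk7 holds the
        ;  precomputed quotient constant for the polynomial and rk8 the polynomial itself, so
        ;  two carry-less multiplies and an xor leave the 32-bit CRC, extracted below)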
.barrett:
        vmovdqa xmm10, [rk7]           ; rk7 and rk8 in xmm10
        vmovdqa xmm0, xmm7
        vpclmulqdq xmm7, xmm10, 0x01
        vpslldq xmm7, 4
        vpclmulqdq xmm7, xmm10, 0x11

        vpslldq xmm7, 4
        vpxor   xmm7, xmm0
        vpextrd eax, xmm7, 1

.cleanup:
        not     eax


%ifidn __OUTPUT_FORMAT__, win64
        vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
        vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
        vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
        vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
        vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
        vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
        vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
        vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
        vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
        vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
%endif
        add     rsp, VARIABLE_OFFSET
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
.less_than_256:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      .less_than_32

        ; if there is, load the constants
        vmovdqa xmm10, [rk1]           ; rk1 and rk2 in xmm10

        vmovd   xmm0, arg1_low32       ; get the initial crc value
        vpslldq xmm0, 12               ; align it to its correct place
        vmovdqu xmm7, [arg2]           ; load the plaintext
        vpshufb xmm7, xmm18            ; byte-reflect the plaintext
        vpxor   xmm7, xmm0

        ; update the buffer pointer
        add     arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        sub     arg3, 32

        jmp     .16B_reduction_loop


align 16
.less_than_32:
        ; mov initial crc to the return value. this is necessary for zero-length buffers.
        mov     eax, arg1_low32
        test    arg3, arg3
        je      .cleanup

        vmovd   xmm0, arg1_low32       ; get the initial crc value
        vpslldq xmm0, 12               ; align it to its correct place

        cmp     arg3, 16
        je      .exact_16_left
        jl      .less_than_16_left

        vmovdqu xmm7, [arg2]           ; load the plaintext
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0             ; xor the initial crc value
        add     arg2, 16
        sub     arg3, 16
        vmovdqa xmm10, [rk1]           ; rk1 and rk2 in xmm10
        jmp     .get_last_two_xmms

align 16
.less_than_16_left:
        ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.

        vpxor   xmm1, xmm1
        mov     r11, rsp
        vmovdqa [r11], xmm1

        cmp     arg3, 4
        jl      .only_less_than_4

        ; backup the counter value
        mov     r9, arg3
        cmp     arg3, 8
        jl      .less_than_8_left

        ; load 8 Bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
.less_than_8_left:

        cmp     arg3, 4
        jl      .less_than_4_left

        ; load 4 Bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
.less_than_4_left:

        cmp     arg3, 2
        jl      .less_than_2_left

        ; load 2 Bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
.less_than_2_left:
        cmp     arg3, 1
        jl      .zero_left

        ; load 1 Byte
        mov     al, [arg2]
        mov     [r11], al

.zero_left:
        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0             ; xor the initial crc value

        lea     rax, [pshufb_shf_table + 16]
        sub     rax, r9
        vmovdqu xmm0, [rax]
        vpxor   xmm0, [mask1]

        vpshufb xmm7, xmm0
        jmp     .128_done

align 16
.exact_16_left:
        vmovdqu xmm7, [arg2]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0             ; xor the initial crc value
        jmp     .128_done

.only_less_than_4:
        cmp     arg3, 3
        jl      .only_less_than_3

        ; load 3 Bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        mov     al, [arg2+2]
        mov     [r11+2], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0             ; xor the initial crc value

        vpsrldq xmm7, 5
        jmp     .barrett

.only_less_than_3:
        cmp     arg3, 2
        jl      .only_less_than_2

        ; load 2 Bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0             ; xor the initial crc value

        vpsrldq xmm7, 6
        jmp     .barrett

.only_less_than_2:
        ; load 1 Byte
        mov     al, [arg2]
        mov     [r11], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0             ; xor the initial crc value

        vpsrldq xmm7, 7
        jmp     .barrett

section .data
align 32

%ifndef USE_CONSTS
; precomputed constants
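; (these are the folding/reduction constants for the IEEE CRC-32 polynomial 0x104C11DB7,
;  which appears literally as rk8; the remaining rk values are x^n mod P terms precomputed
;  for the fold distances used above, as described in the referenced white paper)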
rk_1:  dq 0x1851689900000000
rk_2:  dq 0xa3dc855100000000
rk1:   dq 0xf200aa6600000000
rk2:   dq 0x17d3315d00000000
rk3:   dq 0x022ffca500000000
rk4:   dq 0x9d9ee22f00000000
rk5:   dq 0xf200aa6600000000
rk6:   dq 0x490d678d00000000
rk7:   dq 0x0000000104d101df
rk8:   dq 0x0000000104c11db7
rk9:   dq 0x6ac7e7d700000000
rk10:  dq 0xfcd922af00000000
rk11:  dq 0x34e45a6300000000
rk12:  dq 0x8762c1f600000000
rk13:  dq 0x5395a0ea00000000
rk14:  dq 0x54f2d5c700000000
rk15:  dq 0xd3504ec700000000
rk16:  dq 0x57a8445500000000
rk17:  dq 0xc053585d00000000
rk18:  dq 0x766f1b7800000000
rk19:  dq 0xcd8c54b500000000
rk20:  dq 0xab40b71e00000000

rk_1b: dq 0xf200aa6600000000
rk_2b: dq 0x17d3315d00000000
       dq 0x0000000000000000
       dq 0x0000000000000000
%else
INCLUDE_CONSTS
%endif

mask1: dq 0x8080808080808080, 0x8080808080808080
mask2: dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

SHUF_MASK: dq 0x08090A0B0C0D0E0F, 0x0001020304050607

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908
dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
dq 0x8080808080808080, 0x8080808080808080

%else  ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
%ifidn __OUTPUT_FORMAT__, win64
global no_ %+ FUNCTION_NAME
no_ %+ FUNCTION_NAME %+ :
%endif
%endif ; (AS_FEATURE_LEVEL) >= 10