;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2020 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;       Function API:
;       UINT32 crc32_ieee_by16_10(
;               UINT32 init_crc,          // initial CRC value, 32 bits
;               const unsigned char *buf, // buffer pointer to calculate CRC on
;               UINT64 len                // buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
;
;

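; Note: consistent with the common convention of tracking the inverted CRC internally,
; this routine bit-complements init_crc on entry and the returned CRC on exit
; (see the `not arg1_low32` / `not eax` instructions below).
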
%include "reg_sizes.asm"

%ifndef FUNCTION_NAME
%define FUNCTION_NAME crc32_ieee_by16_10
%endif

%if (AS_FEATURE_LEVEL) >= 10

[bits 64]
default rel

section .text


%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8

        %xdefine        arg1_low32 ecx
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx

        %xdefine        arg1_low32 edi
%endif

%define TMP 16*0
%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_SAVE 16*2
        %define VARIABLE_OFFSET 16*12+8
%else
        %define VARIABLE_OFFSET 16*2+8
%endif
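
; Stack layout (relative to rsp after the `sub rsp, VARIABLE_OFFSET` below):
;   TMP      - 16 bytes of scratch, used when staging tails shorter than 16 bytes
;   XMM_SAVE - win64 only: save area for the non-volatile registers xmm6-xmm15
; The trailing +8 in VARIABLE_OFFSET keeps the frame 16-byte aligned (the return
; address leaves rsp misaligned by 8 at entry), which the aligned vmovdqa
; accesses to [rsp] rely on.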

align 16
mk_global FUNCTION_NAME, function
FUNCTION_NAME:
        endbranch

        not     arg1_low32
        sub     rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
        ; save the non-volatile xmm registers on the stack (win64 ABI)
        vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
        vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
        vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
        vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
        vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
        vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
        vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
        vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
        vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
        vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
%endif

        vbroadcasti32x4 zmm18, [SHUF_MASK]
        cmp     arg3, 256
        jl      .less_than_256

        ; load the initial crc value
        vmovd   xmm10, arg1_low32       ; initial crc

        ; the crc value does not need to be byte-reflected, but it does need to be moved
        ; to the high part of the register, because the data will be byte-reflected and
        ; will then line up with the initial crc in the correct place
        vpslldq xmm10, 12

        ; receive the initial 128B of data, xor in the initial crc value
        vmovdqu8 zmm0, [arg2+16*0]
        vmovdqu8 zmm4, [arg2+16*4]
        vpshufb zmm0, zmm0, zmm18
        vpshufb zmm4, zmm4, zmm18
        vpxorq  zmm0, zmm10
        vbroadcasti32x4 zmm10, [rk3]    ; zmm10 has rk3 and rk4
                                        ; the imm value of the pclmulqdq instruction
                                        ; selects which constant to use

        sub     arg3, 256
        cmp     arg3, 256
        jl      .fold_128_B_loop

        vmovdqu8 zmm7, [arg2+16*8]
        vmovdqu8 zmm8, [arg2+16*12]
        vpshufb zmm7, zmm7, zmm18
        vpshufb zmm8, zmm8, zmm18
        vbroadcasti32x4 zmm16, [rk_1]   ; zmm16 has rk_1 and rk_2
        sub     arg3, 256

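        ; Folding step (see the referenced Intel paper): a 128-bit remainder R that sits
        ; D bits ahead of newly loaded data contributes R*x^D mod P, which is formed from
        ; two carry-less multiplies of R's 64-bit halves by precomputed constants of the
        ; form x^k mod P, followed by an xor with the new data. The 256-byte loop below
        ; keeps 256 bytes of state in zmm0/zmm4/zmm7/zmm8 (16 x 128-bit lanes) and folds
        ; every lane forward by 256 bytes per iteration using rk_1/rk_2; the 128-byte
        ; loop further down keeps its state in zmm0/zmm4 and uses rk3/rk4. The exact
        ; exponents follow the paper's scheme; the assembled values live in .data.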
.fold_256_B_loop:
        add     arg2, 256
        vmovdqu8 zmm3, [arg2+16*0]
        vpshufb zmm3, zmm3, zmm18
        vpclmulqdq zmm1, zmm0, zmm16, 0x00
        vpclmulqdq zmm2, zmm0, zmm16, 0x11
        vpxorq  zmm0, zmm1, zmm2
        vpxorq  zmm0, zmm0, zmm3

        vmovdqu8 zmm9, [arg2+16*4]
        vpshufb zmm9, zmm9, zmm18
        vpclmulqdq zmm5, zmm4, zmm16, 0x00
        vpclmulqdq zmm6, zmm4, zmm16, 0x11
        vpxorq  zmm4, zmm5, zmm6
        vpxorq  zmm4, zmm4, zmm9

        vmovdqu8 zmm11, [arg2+16*8]
        vpshufb zmm11, zmm11, zmm18
        vpclmulqdq zmm12, zmm7, zmm16, 0x00
        vpclmulqdq zmm13, zmm7, zmm16, 0x11
        vpxorq  zmm7, zmm12, zmm13
        vpxorq  zmm7, zmm7, zmm11

        vmovdqu8 zmm17, [arg2+16*12]
        vpshufb zmm17, zmm17, zmm18
        vpclmulqdq zmm14, zmm8, zmm16, 0x00
        vpclmulqdq zmm15, zmm8, zmm16, 0x11
        vpxorq  zmm8, zmm14, zmm15
        vpxorq  zmm8, zmm8, zmm17

        sub     arg3, 256
        jge     .fold_256_B_loop
        ;; Fold 256 into 128
        add     arg2, 256
        vpclmulqdq zmm1, zmm0, zmm10, 0x00
        vpclmulqdq zmm2, zmm0, zmm10, 0x11
        vpternlogq zmm7, zmm1, zmm2, 0x96       ; xor ABC

        vpclmulqdq zmm5, zmm4, zmm10, 0x00
        vpclmulqdq zmm6, zmm4, zmm10, 0x11
        vpternlogq zmm8, zmm5, zmm6, 0x96       ; xor ABC
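        ; note: vpternlogq with imm8 0x96 is a three-input xor (dst = dst ^ src1 ^ src2),
        ; merging the two carry-less products into the accumulator in one instruction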

        vmovdqa32 zmm0, zmm7
        vmovdqa32 zmm4, zmm8

        add     arg3, 128
        jmp     .fold_128_B_register



        ; at this point there are 128*x+y (0 <= y < 128) bytes of buffer left. The
        ; .fold_128_B_loop below folds 128B at a time until only 128+y bytes remain.

        ; fold 128B at a time. The 128 bytes of accumulated state live in zmm0 and zmm4
        ; (8 x 128-bit lanes), which are folded forward in parallel.
.fold_128_B_loop:
        add     arg2, 128
        vmovdqu8 zmm8, [arg2+16*0]
        vpshufb zmm8, zmm8, zmm18
        vpclmulqdq zmm2, zmm0, zmm10, 0x00
        vpclmulqdq zmm1, zmm0, zmm10, 0x11
        vpxorq  zmm0, zmm2, zmm1
        vpxorq  zmm0, zmm0, zmm8

        vmovdqu8 zmm9, [arg2+16*4]
        vpshufb zmm9, zmm9, zmm18
        vpclmulqdq zmm5, zmm4, zmm10, 0x00
        vpclmulqdq zmm6, zmm4, zmm10, 0x11
        vpxorq  zmm4, zmm5, zmm6
        vpxorq  zmm4, zmm4, zmm9

        sub     arg3, 128
        jge     .fold_128_B_loop
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        add     arg2, 128
        ; at this point, the buffer pointer is pointing at the last y bytes of the buffer,
        ; where 0 <= y < 128, and the 128 bytes of folded data are held in zmm0 and zmm4
        ; (8 x 128-bit lanes)

.fold_128_B_register:
        ; fold the 8 128-bit parts into 1 xmm register using different constants
        vmovdqu8 zmm16, [rk9]           ; multiply by rk9-rk16
        vmovdqu8 zmm11, [rk17]          ; multiply by rk17-rk20, rk1, rk2, 0, 0
        vpclmulqdq zmm1, zmm0, zmm16, 0x00
        vpclmulqdq zmm2, zmm0, zmm16, 0x11
        vextracti64x2 xmm7, zmm4, 3     ; save the last lane, which has no multiplicand

        vpclmulqdq zmm5, zmm4, zmm11, 0x00
        vpclmulqdq zmm6, zmm4, zmm11, 0x11
        vmovdqa xmm10, [rk1]            ; needed later in the reduction loop
        vpternlogq zmm1, zmm2, zmm5, 0x96       ; xor ABC
        vpternlogq zmm1, zmm6, zmm7, 0x96       ; xor ABC

        vshufi64x2 zmm8, zmm1, zmm1, 0x4e       ; swap 1,0,3,2 - 01 00 11 10
        vpxorq  ymm8, ymm8, ymm1
        vextracti64x2 xmm5, ymm8, 1
        vpxorq  xmm7, xmm5, xmm8
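        ; the shuffle/extract/xor sequence above collapses the four 128-bit lanes of
        ; zmm1 into a single 128-bit remainder in xmm7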

        ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
        ; instead of a cmp instruction, we use the negative flag with the jl instruction
        add     arg3, 128-16
        jl      .final_reduction_for_128

        ; now we have 16+y bytes left to reduce. 16 bytes are in register xmm7 and the rest is in memory
        ; we can fold 16 bytes at a time if y >= 16
        ; continue folding 16B at a time

.16B_reduction_loop:
        vpclmulqdq xmm8, xmm7, xmm10, 0x11
        vpclmulqdq xmm7, xmm7, xmm10, 0x00
        vpxor   xmm7, xmm8
        vmovdqu xmm0, [arg2]
        vpshufb xmm0, xmm0, xmm18
        vpxor   xmm7, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the jge instruction
        ; equivalent of: cmp arg3, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge     .16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm7 register


.final_reduction_for_128:
        add     arg3, 16
        je      .128_done

        ; here we are getting data that is less than 16 bytes.
        ; since we know that there was data before the pointer, we can offset
        ; the input pointer before the actual point, to receive exactly 16 bytes.
        ; after that the registers need to be adjusted.
.get_last_two_xmms:

        vmovdqa xmm2, xmm7
        vmovdqu xmm1, [arg2 - 16 + arg3]
        vpshufb xmm1, xmm18

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, arg3
        vmovdqu xmm0, [rax]

        vpshufb xmm2, xmm0
        vpxor   xmm0, [mask1]
        vpshufb xmm7, xmm0
        vpblendvb xmm1, xmm1, xmm2, xmm0

        vpclmulqdq xmm8, xmm7, xmm10, 0x11
        vpclmulqdq xmm7, xmm7, xmm10, 0x00
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm1

.128_done:
        ; compute crc of a 128-bit value
        vmovdqa xmm10, [rk5]
        vmovdqa xmm0, xmm7

        ; 64b fold
        vpclmulqdq xmm7, xmm10, 0x01    ; H*L
        vpslldq xmm0, 8
        vpxor   xmm7, xmm0

        ; 32b fold
        vmovdqa xmm0, xmm7
        vpand   xmm0, [mask2]
        vpsrldq xmm7, 12
        vpclmulqdq xmm7, xmm10, 0x10
        vpxor   xmm7, xmm0

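        ; Barrett reduction: compute the final 32-bit remainder without a division.
        ; rk7 holds the precomputed quotient mu = floor(x^64 / P) and rk8 holds the
        ; polynomial P itself (0x104c11db7, i.e. CRC-32 IEEE including its x^32 term);
        ; two carry-less multiplies plus an xor recover the remainder, which is read out
        ; of dword 1 of xmm7. Constant roles as described in the referenced Intel paper.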
        ; barrett reduction
.barrett:
        vmovdqa xmm10, [rk7]    ; rk7 and rk8 in xmm10
        vmovdqa xmm0, xmm7
        vpclmulqdq xmm7, xmm10, 0x01
        vpslldq xmm7, 4
        vpclmulqdq xmm7, xmm10, 0x11

        vpslldq xmm7, 4
        vpxor   xmm7, xmm0
        vpextrd eax, xmm7, 1

.cleanup:
        not     eax


%ifidn __OUTPUT_FORMAT__, win64
        vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
        vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
        vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
        vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
        vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
        vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
        vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
        vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
        vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
        vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
%endif
        add     rsp, VARIABLE_OFFSET
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
.less_than_256:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      .less_than_32

        ; if there is, load the constants
        vmovdqa xmm10, [rk1]    ; rk1 and rk2 in xmm10

        vmovd   xmm0, arg1_low32        ; get the initial crc value
        vpslldq xmm0, 12                ; align it to its correct place
        vmovdqu xmm7, [arg2]            ; load the plaintext
        vpshufb xmm7, xmm18             ; byte-reflect the plaintext
        vpxor   xmm7, xmm0

        ; update the buffer pointer
        add     arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        sub     arg3, 32

        jmp     .16B_reduction_loop


align 16
.less_than_32:
        ; mov initial crc to the return value. this is necessary for zero-length buffers.
        mov     eax, arg1_low32
        test    arg3, arg3
        je      .cleanup

        vmovd   xmm0, arg1_low32        ; get the initial crc value
        vpslldq xmm0, 12                ; align it to its correct place

        cmp     arg3, 16
        je      .exact_16_left
        jl      .less_than_16_left

        vmovdqu xmm7, [arg2]            ; load the plaintext
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0              ; xor the initial crc value
        add     arg2, 16
        sub     arg3, 16
        vmovdqa xmm10, [rk1]            ; rk1 and rk2 in xmm10
        jmp     .get_last_two_xmms

align 16
.less_than_16_left:
        ; use stack space to load data less than 16 bytes; zero out the 16B in memory first.

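        ; (For tails of 4-15 bytes the copied block is later realigned with a mask from
        ; pshufb_shf_table so the regular 128-bit reduction can finish the CRC; tails of
        ; 1-3 bytes take the .only_less_than_4 paths below, which shift with vpsrldq and
        ; jump straight to the Barrett reduction.)
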
        vpxor   xmm1, xmm1
        mov     r11, rsp
        vmovdqa [r11], xmm1

        cmp     arg3, 4
        jl      .only_less_than_4

        ; backup the counter value
        mov     r9, arg3
        cmp     arg3, 8
        jl      .less_than_8_left

        ; load 8 bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
.less_than_8_left:

        cmp     arg3, 4
        jl      .less_than_4_left

        ; load 4 bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
.less_than_4_left:

        cmp     arg3, 2
        jl      .less_than_2_left

        ; load 2 bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
.less_than_2_left:
        cmp     arg3, 1
        jl      .zero_left

        ; load 1 byte
        mov     al, [arg2]
        mov     [r11], al

.zero_left:
        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value

        lea     rax, [pshufb_shf_table + 16]
        sub     rax, r9
        vmovdqu xmm0, [rax]
        vpxor   xmm0, [mask1]

        vpshufb xmm7, xmm0
        jmp     .128_done

align 16
.exact_16_left:
        vmovdqu xmm7, [arg2]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value
        jmp     .128_done

.only_less_than_4:
        cmp     arg3, 3
        jl      .only_less_than_3

        ; load 3 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        mov     al, [arg2+2]
        mov     [r11+2], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value

        vpsrldq xmm7, 5
        jmp     .barrett

.only_less_than_3:
        cmp     arg3, 2
        jl      .only_less_than_2

        ; load 2 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value

        vpsrldq xmm7, 6
        jmp     .barrett

.only_less_than_2:
        ; load 1 byte
        mov     al, [arg2]
        mov     [r11], al

        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0      ; xor the initial crc value

        vpsrldq xmm7, 7
        jmp     .barrett

section .data
align 32

%ifndef USE_CONSTS
; precomputed constants
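; The rk* values below are the folding and reduction constants for the non-reflected
; CRC-32 (IEEE) polynomial 0x04C11DB7. Following the referenced paper's scheme they are
; remainders of the form x^k mod P, stored pre-shifted into the upper 32 bits of each
; qword so they line up as pclmulqdq operands; rk7 (the Barrett constant) and rk8
; (the polynomial itself, including its x^32 term) are stored unshifted.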
rk_1: dq 0x1851689900000000
rk_2: dq 0xa3dc855100000000
rk1:  dq 0xf200aa6600000000
rk2:  dq 0x17d3315d00000000
rk3:  dq 0x022ffca500000000
rk4:  dq 0x9d9ee22f00000000
rk5:  dq 0xf200aa6600000000
rk6:  dq 0x490d678d00000000
rk7:  dq 0x0000000104d101df
rk8:  dq 0x0000000104c11db7
rk9:  dq 0x6ac7e7d700000000
rk10: dq 0xfcd922af00000000
rk11: dq 0x34e45a6300000000
rk12: dq 0x8762c1f600000000
rk13: dq 0x5395a0ea00000000
rk14: dq 0x54f2d5c700000000
rk15: dq 0xd3504ec700000000
rk16: dq 0x57a8445500000000
rk17: dq 0xc053585d00000000
rk18: dq 0x766f1b7800000000
rk19: dq 0xcd8c54b500000000
rk20: dq 0xab40b71e00000000

rk_1b: dq 0xf200aa6600000000
rk_2b: dq 0x17d3315d00000000
        dq 0x0000000000000000
        dq 0x0000000000000000
%else
INCLUDE_CONSTS
%endif

mask1: dq 0x8080808080808080, 0x8080808080808080
mask2: dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

SHUF_MASK: dq 0x08090A0B0C0D0E0F, 0x0001020304050607
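; SHUF_MASK reverses the byte order within each 128-bit lane (byte i comes from byte
; 15-i), turning the little-endian byte stream into the byte ordering this
; non-reflected CRC variant folds on.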

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908
dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
dq 0x8080808080808080, 0x8080808080808080

%else  ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
%ifidn __OUTPUT_FORMAT__, win64
global no_ %+ FUNCTION_NAME
no_ %+ FUNCTION_NAME %+ :
%endif
%endif ; (AS_FEATURE_LEVEL) >= 10