[ceph.git] / ceph / src / isa-l / crc / crc32_ieee_01.asm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;       Function API:
;       UINT32 crc32_ieee_01(
;               UINT32 init_crc, //initial CRC value, 32 bits
;               const unsigned char *buf, //buffer pointer to calculate CRC on
;               UINT64 len //buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf

; Project include; its contents are not visible here (presumably it provides
; the slversion macro invoked at the end of this file -- TODO confirm).
%include "reg_sizes.asm"

; Distance in bytes ahead of the current buffer pointer that the prefetchnta
; instructions in _fold_128_B_loop reach.
%define	fetch_dist	1024
[bits 64]
; Memory operands without an explicit base register are RIP-relative.
default rel

section .text

; Bind the C arguments (init_crc, buf, len) to the integer argument registers
; of the target calling convention: Microsoft x64 when assembling for win64,
; System V AMD64 otherwise. arg1_low32 is the 32-bit view of arg1.
%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8

        %xdefine        arg1_low32 ecx
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx

        %xdefine        arg1_low32 edi
%endif

; Stack frame layout, as offsets from rsp after the prologue:
;   TMP             - 16-byte scratch slot at the bottom of the frame; the
;                     short-buffer paths address it as [rsp] / [r11] directly
;                     instead of through this name (TMP itself is never
;                     referenced in this file)
;   XMM_SAVE        - (win64 only) save area for xmm6-xmm13, 8 x 16 bytes
;   VARIABLE_OFFSET - total frame size; the extra 8 bytes make rsp 16-byte
;                     aligned for the movdqa stack accesses, assuming the
;                     ABI-mandated rsp % 16 == 8 at function entry
%define TMP 16*0
%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_SAVE 16*2
        %define VARIABLE_OFFSET 16*10+8
%else
        %define VARIABLE_OFFSET 16*2+8
%endif
;-----------------------------------------------------------------------
; UINT32 crc32_ieee_01(UINT32 init_crc, const unsigned char *buf, UINT64 len)
;
; In:    arg1_low32 = init_crc, arg2 = buf, arg3 = len in bytes
; Out:   eax = resulting CRC
; The initial value is bit-inverted on entry and the result is bit-inverted
; again in _cleanup, so a zero-length buffer returns init_crc unchanged.
; Uses rax, r9, r11, the three argument registers and xmm0-xmm13 as scratch;
; on win64 xmm6-xmm13 are saved here and restored in _cleanup.
; NOTE(review): len is compared with signed jumps (jl/jge) throughout, so a
; len >= 2^63 would be treated as a short buffer -- presumably such lengths
; never occur; confirm against callers.
;-----------------------------------------------------------------------
align 16
global	crc32_ieee_01:function
crc32_ieee_01:

	not	arg1_low32      ;~init_crc

	; allocate the frame: scratch slot (plus the xmm save area on win64)
	sub	rsp,VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
        ; save xmm6-xmm13 on the stack: they are callee-saved in the
        ; Microsoft x64 ABI and are overwritten below; _cleanup restores them
        movdqa  [rsp + XMM_SAVE + 16*0], xmm6
        movdqa  [rsp + XMM_SAVE + 16*1], xmm7
        movdqa  [rsp + XMM_SAVE + 16*2], xmm8
        movdqa  [rsp + XMM_SAVE + 16*3], xmm9
        movdqa  [rsp + XMM_SAVE + 16*4], xmm10
        movdqa  [rsp + XMM_SAVE + 16*5], xmm11
        movdqa  [rsp + XMM_SAVE + 16*6], xmm12
        movdqa  [rsp + XMM_SAVE + 16*7], xmm13
%endif


	; check if smaller than 256
	cmp	arg3, 256

	; for sizes less than 256, we can't fold 128B at a time...
	; (the setup below consumes 128 bytes and the loop body another 128)
	jl	_less_than_256


	; load the initial crc value
	movd	xmm10, arg1_low32	; initial crc

	; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
	; because data will be byte-reflected and will align with initial crc at correct place.
	pslldq	xmm10, 12

	; xmm11 = byte-reversal shuffle mask; it stays live for the rest of the function
	movdqa xmm11, [SHUF_MASK]
	; receive the initial 128B data, xor the initial crc value
	movdqu	xmm0, [arg2+16*0]
	movdqu	xmm1, [arg2+16*1]
	movdqu	xmm2, [arg2+16*2]
	movdqu	xmm3, [arg2+16*3]
	movdqu	xmm4, [arg2+16*4]
	movdqu	xmm5, [arg2+16*5]
	movdqu	xmm6, [arg2+16*6]
	movdqu	xmm7, [arg2+16*7]

	; byte-reverse each 16-byte lane so the first buffer byte ends up in the
	; most significant byte of its register
	pshufb	xmm0, xmm11
	; XOR the initial_crc value
	; (only into xmm0: it lands on the first 4 bytes of the buffer)
	pxor	xmm0, xmm10
	pshufb	xmm1, xmm11
	pshufb	xmm2, xmm11
	pshufb	xmm3, xmm11
	pshufb	xmm4, xmm11
	pshufb	xmm5, xmm11
	pshufb	xmm6, xmm11
	pshufb	xmm7, xmm11

	movdqa	xmm10, [rk3]	;xmm10 has rk3 and rk4
					;imm value of pclmulqdq instruction will determine which constant to use
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	; we subtract 256 instead of 128 to save one instruction from the loop
	; (arg3 now holds len - 256: the bytes left after the 128 loaded above and
	; the 128 the first loop iteration will consume)
	sub	arg3, 256

	; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
	; loop will fold 128B at a time until we have 128+y Bytes of buffer


	; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
	;
	; Register roles inside the loop:
	;   xmm0-xmm7   running 128-byte folded state
	;   xmm10       rk3 (low qword) and rk4 (high qword)
	;   xmm11       byte-reversal shuffle mask
	;   xmm9, xmm12 next 2 x 16 bytes of input (byte-reversed)
	;   xmm8, xmm13 temporaries holding the high-qword products
	; For each state register x the loop computes
	;   x = clmul(x.lo, rk3) ^ clmul(x.hi, rk4) ^ new_data
	; (pclmulqdq imm 0x0 multiplies the two low qwords, imm 0x11 the two high
	; qwords). xorps performs the same bitwise XOR as pxor.
_fold_128_B_loop:

	; update the buffer pointer
	add	arg2, 128		;    buf += 128;

	; fold xmm0 and xmm1 with input bytes 0-31 of this 128-byte chunk
	prefetchnta [arg2+fetch_dist+0]
	movdqu	xmm9, [arg2+16*0]
	movdqu	xmm12, [arg2+16*1]
	pshufb	xmm9, xmm11
	pshufb	xmm12, xmm11
	movdqa	xmm8, xmm0
	movdqa	xmm13, xmm1
	pclmulqdq	xmm0, xmm10, 0x0
	pclmulqdq	xmm8, xmm10 , 0x11
	pclmulqdq	xmm1, xmm10, 0x0
	pclmulqdq	xmm13, xmm10 , 0x11
	pxor	xmm0, xmm9
	xorps	xmm0, xmm8
	pxor	xmm1, xmm12
	xorps	xmm1, xmm13

	; fold xmm2 and xmm3 with input bytes 32-63
	prefetchnta [arg2+fetch_dist+32]
	movdqu	xmm9, [arg2+16*2]
	movdqu	xmm12, [arg2+16*3]
	pshufb	xmm9, xmm11
	pshufb	xmm12, xmm11
	movdqa	xmm8, xmm2
	movdqa	xmm13, xmm3
	pclmulqdq	xmm2, xmm10, 0x0
	pclmulqdq	xmm8, xmm10 , 0x11
	pclmulqdq	xmm3, xmm10, 0x0
	pclmulqdq	xmm13, xmm10 , 0x11
	pxor	xmm2, xmm9
	xorps	xmm2, xmm8
	pxor	xmm3, xmm12
	xorps	xmm3, xmm13

	; fold xmm4 and xmm5 with input bytes 64-95
	prefetchnta [arg2+fetch_dist+64]
	movdqu	xmm9, [arg2+16*4]
	movdqu	xmm12, [arg2+16*5]
	pshufb	xmm9, xmm11
	pshufb	xmm12, xmm11
	movdqa	xmm8, xmm4
	movdqa	xmm13, xmm5
	pclmulqdq	xmm4, xmm10, 0x0
	pclmulqdq	xmm8, xmm10 , 0x11
	pclmulqdq	xmm5, xmm10, 0x0
	pclmulqdq	xmm13, xmm10 , 0x11
	pxor	xmm4, xmm9
	xorps	xmm4, xmm8
	pxor	xmm5, xmm12
	xorps	xmm5, xmm13

	; fold xmm6 and xmm7 with input bytes 96-127
	prefetchnta [arg2+fetch_dist+96]
	movdqu	xmm9, [arg2+16*6]
	movdqu	xmm12, [arg2+16*7]
	pshufb	xmm9, xmm11
	pshufb	xmm12, xmm11
	movdqa	xmm8, xmm6
	movdqa	xmm13, xmm7
	pclmulqdq	xmm6, xmm10, 0x0
	pclmulqdq	xmm8, xmm10 , 0x11
	pclmulqdq	xmm7, xmm10, 0x0
	pclmulqdq	xmm13, xmm10 , 0x11
	pxor	xmm6, xmm9
	xorps	xmm6, xmm8
	pxor	xmm7, xmm12
	xorps	xmm7, xmm13

	; arg3 is biased by -128 (see the "sub arg3, 256" before the loop), so a
	; non-negative result means a further full 128-byte chunk is available
	sub	arg3, 128

	; check if there is another 128B in the buffer to be able to fold
	jge	_fold_128_B_loop
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


	add	arg2, 128
	; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
	; the 128B of folded data is in 8 xmm registers: xmm0 through xmm7


	; fold the 8 xmm registers to 1 xmm register with different constants
	;
	; xmm0..xmm6 are each multiplied by their own constant pair and XORed into
	; xmm7. Each movdqa loads a pair (rk9/rk10, rk11/rk12, ..., rk1/rk2) into
	; xmm10; imm 0x0 multiplies the low qwords (x.lo * rk[odd]) and imm 0x11
	; the high qwords (x.hi * rk[even]). pxor and xorps are used
	; interchangeably here; both are plain bitwise XOR.

	; xmm0 uses rk9/rk10
	movdqa	xmm10, [rk9]
	movdqa	xmm8, xmm0
	pclmulqdq	xmm0, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	xorps	xmm7, xmm0

	; xmm1 uses rk11/rk12
	movdqa	xmm10, [rk11]
	movdqa	xmm8, xmm1
	pclmulqdq	xmm1, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	xorps	xmm7, xmm1

	; xmm2 uses rk13/rk14
	movdqa	xmm10, [rk13]
	movdqa	xmm8, xmm2
	pclmulqdq	xmm2, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	pxor	xmm7, xmm2

	; xmm3 uses rk15/rk16
	movdqa	xmm10, [rk15]
	movdqa	xmm8, xmm3
	pclmulqdq	xmm3, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	xorps	xmm7, xmm3

	; xmm4 uses rk17/rk18
	movdqa	xmm10, [rk17]
	movdqa	xmm8, xmm4
	pclmulqdq	xmm4, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	pxor	xmm7, xmm4

	; xmm5 uses rk19/rk20
	movdqa	xmm10, [rk19]
	movdqa	xmm8, xmm5
	pclmulqdq	xmm5, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	xorps	xmm7, xmm5

	; xmm6 uses rk1/rk2; xmm10 keeps this pair for the 16-byte folding below
	movdqa	xmm10, [rk1]	;xmm10 has rk1 and rk2
									;imm value of pclmulqdq instruction will determine which constant to use
	movdqa	xmm8, xmm6
	pclmulqdq	xmm6, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	pxor	xmm7, xmm6


	; arg3 left the loop biased by -128 (it is in [-128, -1]). Adding 128-16
	; turns it into (bytes still in memory) - 16, which is the bias
	; _16B_reduction_loop expects and saves an instruction inside that loop.
	; The sign of the result replaces a cmp: negative means fewer than 16
	; bytes remain, so the 16-byte loop is skipped.
	add	arg3, 128-16
	jl	_final_reduction_for_128

	; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
	; we can fold 16 bytes at a time if y>=16
	; continue folding 16B at a time
	;
	; Entry invariants (also relied on by the jump from _less_than_256):
	;   xmm7  = current 16-byte folded state
	;   xmm10 = rk1 (low qword) / rk2 (high qword)
	;   xmm11 = byte-reversal shuffle mask
	;   arg2  = next unread input byte
	;   arg3  = (bytes still in memory) - 16, and is >= 0

_16B_reduction_loop:
	; xmm7 = clmul(xmm7.hi, rk2) ^ clmul(xmm7.lo, rk1) ^ next 16 input bytes
	movdqa	xmm8, xmm7
	pclmulqdq	xmm7, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	movdqu	xmm0, [arg2]
	pshufb	xmm0, xmm11
	pxor	xmm7, xmm0
	add	arg2, 16
	sub	arg3, 16
	; instead of a cmp instruction, we utilize the flags with the jge instruction
	; equivalent of: cmp arg3, 16-16
	; check if there is any more 16B in the buffer to be able to fold
	jge	_16B_reduction_loop

	;now we have 16+z bytes left to reduce, where 0<= z < 16.
	;first, we reduce the data in the xmm7 register


_final_reduction_for_128:
	; check if any more data to fold. If not, compute the CRC of the final 128 bits
	; (arg3 is biased by -16 here; adding 16 yields z, the leftover byte count)
	add	arg3, 16
	je	_128_done

	; here we are getting data that is less than 16 bytes.
	; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
	; after that the registers need to be adjusted.
	;
	; Entry invariants (also relied on by the jump from _less_than_32):
	;   xmm7 = folded state, xmm10 = rk1/rk2, xmm11 = byte-reversal mask,
	;   arg2 = first leftover byte, arg3 = number of leftover bytes (1..15)
_get_last_two_xmms:
	movdqa	xmm2, xmm7

	; load the last 16 bytes of the buffer; the first 16-arg3 of them were
	; already consumed and are discarded by the blend below
	movdqu	xmm1, [arg2 - 16 + arg3]
	pshufb	xmm1, xmm11

	; get rid of the extra data that was loaded before
	; load the shift constant
	; (a 16-byte window into pshufb_shf_table starting at offset 16-arg3: its
	; first arg3 bytes have the high bit set, the rest are indices 0, 1, 2, ...)
	lea	rax, [pshufb_shf_table + 16]
	sub	rax, arg3
	movdqu	xmm0, [rax]

	; shift xmm2 to the left by arg3 bytes
	pshufb	xmm2, xmm0

	; shift xmm7 to the right by 16-arg3 bytes
	; (xor with 0x80 in every byte flips which lanes pshufb zeroes)
	pxor	xmm0, [mask1]
	pshufb	xmm7, xmm0
	; per byte: where the mask in xmm0 has its high bit set take xmm2 (old
	; state shifted left), elsewhere keep xmm1 (the arg3 new input bytes)
	pblendvb	xmm1, xmm2	;xmm0 is implicit

	; fold 16 Bytes
	movdqa	xmm2, xmm1
	movdqa	xmm8, xmm7
	pclmulqdq	xmm7, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	pxor	xmm7, xmm2

; Final reduction of the 128-bit folded value in xmm7 down to the 32-bit CRC.
; Reached by fall-through and from the short-buffer paths; the paths for
; fewer than 4 bytes enter directly at _barrett.
_128_done:
	; compute crc of a 128-bit value
	movdqa	xmm10, [rk5]	; rk5 and rk6 in xmm10
	movdqa	xmm0, xmm7

	;64b fold
	; imm 0x1: high qword of xmm7 times low qword of xmm10 (rk5); the low
	; qword of the old value is moved up by 8 bytes and XORed in
	pclmulqdq	xmm7, xmm10, 0x1
	pslldq	xmm0, 8
	pxor	xmm7, xmm0

	;32b fold
	movdqa	xmm0, xmm7

	; keep the low 96 bits in xmm0 (mask2 clears the top dword)
	pand	xmm0, [mask2]

	; move the top dword down to the bottom and multiply it by rk6
	; (imm 0x10: low qword of xmm7 times high qword of xmm10)
	psrldq	xmm7, 12
	pclmulqdq	xmm7, xmm10, 0x10
	pxor	xmm7, xmm0

	;barrett reduction
_barrett:
	movdqa	xmm10, [rk7]	; rk7 and rk8 in xmm10
	movdqa	xmm0, xmm7
	; imm 0x01: high qword of xmm7 times rk7
	pclmulqdq	xmm7, xmm10, 0x01
	pslldq	xmm7, 4
	; imm 0x11: high qword of xmm7 times rk8
	pclmulqdq	xmm7, xmm10, 0x11

	pslldq	xmm7, 4
	pxor	xmm7, xmm0
	; the reduced 32-bit value is in dword 1 of xmm7
	pextrd	eax, xmm7,1

; Common exit: eax holds the CRC before the final inversion (for a zero-length
; buffer, the inverted init_crc placed there by _less_than_32).
_cleanup:
	not     eax
%ifidn __OUTPUT_FORMAT__, win64
        ; restore the xmm registers saved in the prologue
        movdqa  xmm6, [rsp + XMM_SAVE + 16*0]
        movdqa  xmm7, [rsp + XMM_SAVE + 16*1]
        movdqa  xmm8, [rsp + XMM_SAVE + 16*2]
        movdqa  xmm9, [rsp + XMM_SAVE + 16*3]
        movdqa  xmm10, [rsp + XMM_SAVE + 16*4]
        movdqa  xmm11, [rsp + XMM_SAVE + 16*5]
        movdqa  xmm12, [rsp + XMM_SAVE + 16*6]
        movdqa  xmm13, [rsp + XMM_SAVE + 16*7]
%endif
	; release the frame allocated in the prologue
	add	rsp,VARIABLE_OFFSET
	ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Buffers shorter than 256 bytes. Lengths of 32 or more load the first 16
; bytes here and then join the 16-byte folding loop; shorter ones are handled
; by _less_than_32.
align 16
_less_than_256:

	; check if there is enough buffer to be able to fold 16B at a time
	cmp	arg3, 32
	jl	_less_than_32
	movdqa xmm11, [SHUF_MASK]

	; if there is, load the constants
	movdqa	xmm10, [rk1]	; rk1 and rk2 in xmm10

	movd	xmm0, arg1_low32	; get the initial crc value
	pslldq	xmm0, 12	; align it to its correct place
	movdqu	xmm7, [arg2]	; load the plaintext
	pshufb	xmm7, xmm11	; byte-reflect the plaintext
	pxor	xmm7, xmm0


	; update the buffer pointer
	add	arg2, 16

	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
	; (arg3 becomes (bytes still in memory) - 16, the bias the loop expects)
	sub	arg3, 32

	jmp	_16B_reduction_loop


; Buffers shorter than 32 bytes: dispatch on len == 0, len < 16, len == 16
; and 16 < len < 32.
align 16
_less_than_32:
	; mov initial crc to the return value. this is necessary for zero-length buffers.
	; (arg1_low32 was inverted on entry; _cleanup inverts eax back)
	mov	eax, arg1_low32
	test	arg3, arg3
	je	_cleanup

	movdqa xmm11, [SHUF_MASK]

	movd	xmm0, arg1_low32	; get the initial crc value
	pslldq	xmm0, 12	; align it to its correct place

	cmp	arg3, 16
	je	_exact_16_left
	jl	_less_than_16_left

	; 16 < len < 32: take the first 16 bytes as the state and let
	; _get_last_two_xmms merge in the remaining 1..15 bytes
	movdqu	xmm7, [arg2]	; load the plaintext
	pshufb	xmm7, xmm11	; byte-reflect the plaintext
	pxor	xmm7, xmm0	; xor the initial crc value
	add	arg2, 16
	sub	arg3, 16
	movdqa	xmm10, [rk1]	; rk1 and rk2 in xmm10
	jmp	_get_last_two_xmms


; Buffers of 1..15 bytes. The bytes are copied into a zeroed 16-byte stack
; slot in 8/4/2/1-byte pieces so that nothing past the end of the buffer is
; read. Lengths below 4 branch to _only_less_than_4.
; On entry xmm0 holds the initial crc in its top dword and xmm11 the
; byte-reversal mask.
align 16
_less_than_16_left:
	; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.

	pxor	xmm1, xmm1
	mov	r11, rsp	; r11 = write cursor into the scratch slot at [rsp]
	movdqa	[r11], xmm1

	cmp	arg3, 4
	jl	_only_less_than_4

	;	backup the counter value
	;	(arg3 is consumed below; r9 keeps the length for the final shift)
	mov	r9, arg3
	cmp	arg3, 8
	jl	_less_than_8_left

	; load 8 Bytes
	mov	rax, [arg2]
	mov	[r11], rax
	add	r11, 8
	sub	arg3, 8
	add	arg2, 8
_less_than_8_left:

	cmp	arg3, 4
	jl	_less_than_4_left

	; load 4 Bytes
	mov	eax, [arg2]
	mov	[r11], eax
	add	r11, 4
	sub	arg3, 4
	add	arg2, 4
_less_than_4_left:

	cmp	arg3, 2
	jl	_less_than_2_left

	; load 2 Bytes
	mov	ax, [arg2]
	mov	[r11], ax
	add	r11, 2
	sub	arg3, 2
	add	arg2, 2
_less_than_2_left:
	cmp     arg3, 1
        jl      _zero_left

	; load 1 Byte
	mov	al, [arg2]
	mov	[r11], al
_zero_left:
	; the scratch slot now holds the data followed by zero padding
	movdqa	xmm7, [rsp]
	pshufb	xmm7, xmm11
	pxor	xmm7, xmm0	; xor the initial crc value

	; build the pshufb mask that shifts xmm7 right by 16-len bytes: take the
	; 16-byte window of pshufb_shf_table at offset 16-len (r9 = len) and flip
	; the high bit of every byte with mask1
	lea	rax, [pshufb_shf_table + 16]
	sub	rax, r9
	movdqu	xmm0, [rax]
	pxor	xmm0, [mask1]

	pshufb	xmm7, xmm0
	jmp	_128_done

; Buffer of exactly 16 bytes: the data itself (byte-reversed, with the initial
; crc XORed into the top dword) is the 128-bit value to reduce.
align 16
_exact_16_left:
	movdqu	xmm7, [arg2]
	pshufb	xmm7, xmm11
	pxor	xmm7, xmm0	; xor the initial crc value

	jmp	_128_done

; Buffers of 1..3 bytes. Reached from _less_than_16_left with r11 = rsp
; pointing at the zeroed 16-byte scratch slot, xmm0 = initial crc in the top
; dword and xmm11 = byte-reversal mask. The bytes are copied one at a time,
; byte-reversed into the top of xmm7, combined with the initial crc, shifted
; right by 8-len bytes and passed straight to _barrett (the 64-bit and 32-bit
; folds in _128_done are skipped).
_only_less_than_4:
	cmp	arg3, 3
	jl	_only_less_than_3

	; load 3 Bytes
	mov	al, [arg2]
	mov	[r11], al

	mov	al, [arg2+1]
	mov	[r11+1], al

	mov	al, [arg2+2]
	mov	[r11+2], al

	movdqa	xmm7, [rsp]
	pshufb	xmm7, xmm11
	pxor	xmm7, xmm0	; xor the initial crc value

	; the 3 data bytes move from bytes 13-15 down to bytes 8-10
	psrldq	xmm7, 5

	jmp	_barrett
_only_less_than_3:
	cmp	arg3, 2
	jl	_only_less_than_2

	; load 2 Bytes
	mov	al, [arg2]
	mov	[r11], al

	mov	al, [arg2+1]
	mov	[r11+1], al

	movdqa	xmm7, [rsp]
	pshufb	xmm7, xmm11
	pxor	xmm7, xmm0	; xor the initial crc value

	; the 2 data bytes move from bytes 14-15 down to bytes 8-9
	psrldq	xmm7, 6

	jmp	_barrett
_only_less_than_2:

	; load 1 Byte
	mov	al, [arg2]
	mov	[r11], al

	movdqa	xmm7, [rsp]
	pshufb	xmm7, xmm11
	pxor	xmm7, xmm0	; xor the initial crc value

	; the single data byte moves from byte 15 down to byte 8
	psrldq	xmm7, 7

	jmp	_barrett

section .data

; precomputed constants
;
; The rk values are carry-less multiplication constants. The code always
; loads them as 16-byte pairs with movdqa (rk1/rk2, rk3/rk4, ...), so the odd
; constant lands in the low qword and the even one in the high qword of
; xmm10; the pairs must stay adjacent and 16-byte aligned.
;   rk1/rk2    16-byte folding (_16B_reduction_loop, _get_last_two_xmms) and
;              the fold of xmm6 into xmm7
;   rk3/rk4    128-byte folding (_fold_128_B_loop)
;   rk5/rk6    64-bit and 32-bit folds in _128_done
;   rk7/rk8    Barrett reduction (_barrett); the low 33 bits of rk8 are the
;              CRC-32 IEEE generator polynomial 0x104c11db7
;   rk9..rk20  fold of xmm0..xmm5 into xmm7 after the 128-byte loop
; NOTE(review): the derivation of the individual values is not shown in this
; file; presumably they follow the reference paper cited in the file header.
align 16

rk1 :
DQ 0xf200aa6600000000
rk2 :
DQ 0x17d3315d00000000
rk3 :
DQ 0x022ffca500000000
rk4 :
DQ 0x9d9ee22f00000000
rk5 :
DQ 0xf200aa6600000000
rk6 :
DQ 0x490d678d00000000
rk7 :
DQ 0x0000000104d101df
rk8 :
DQ 0x0000000104c11db7
rk9 :
DQ 0x6ac7e7d700000000
rk10 :
DQ 0xfcd922af00000000
rk11 :
DQ 0x34e45a6300000000
rk12 :
DQ 0x8762c1f600000000
rk13 :
DQ 0x5395a0ea00000000
rk14 :
DQ 0x54f2d5c700000000
rk15 :
DQ 0xd3504ec700000000
rk16 :
DQ 0x57a8445500000000
rk17 :
DQ 0xc053585d00000000
rk18 :
DQ 0x766f1b7800000000
rk19 :
DQ 0xcd8c54b500000000
rk20 :
DQ 0xab40b71e00000000


; mask1: 0x80 in every byte; XORed into a pshufb control to flip which lanes
; are zeroed (turns a "shift left" control into the matching "shift right")
mask1:
dq 0x8080808080808080, 0x8080808080808080
; mask2: clears the top 32 bits of a 128-bit value (used by the 32-bit fold)
mask2:
dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

; SHUF_MASK: pshufb control that reverses the 16 bytes of a register
SHUF_MASK:
dq 0x08090A0B0C0D0E0F, 0x0001020304050607

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; the code reads a 16-byte window of the 32-byte table below, starting at
; offset 16-n; used as is it shifts a register left by n bytes, and XORed
; with mask1 it shifts right by 16-n bytes
; different alignments result in values as shown:
;	dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;	dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;	dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;	dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;	dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;	dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;	dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
;	dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
;	dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
;	dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;	dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;	dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;	dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;	dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;	dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908

;;;       func        core, ver, snum
; slversion is a macro defined outside this file (presumably in reg_sizes.asm
; -- TODO confirm); it is passed the function name and version fields
slversion crc32_ieee_01, 01,   06,  0011
Commit	Line	Data
7c673cae FG	1	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	2	; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
	3	;
	4	; Redistribution and use in source and binary forms, with or without
	5	; modification, are permitted provided that the following conditions
	6	; are met:
	7	; * Redistributions of source code must retain the above copyright
	8	; notice, this list of conditions and the following disclaimer.
	9	; * Redistributions in binary form must reproduce the above copyright
	10	; notice, this list of conditions and the following disclaimer in
	11	; the documentation and/or other materials provided with the
	12	; distribution.
	13	; * Neither the name of Intel Corporation nor the names of its
	14	; contributors may be used to endorse or promote products derived
	15	; from this software without specific prior written permission.
	16	;
	17	; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	18	; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	19	; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	20	; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	21	; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	22	; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	23	; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	24	; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	25	; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	26	; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	27	; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	28	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	29
	30	; Function API:
	31	; UINT32 crc32_ieee_01(
	32	; UINT32 init_crc, //initial CRC value, 32 bits
	33	; const unsigned char *buf, //buffer pointer to calculate CRC on
	34	; UINT64 len //buffer length in bytes (64-bit data)
	35	; );
	36	;
	37	; Authors:
	38	; Erdinc Ozturk
	39	; Vinodh Gopal
	40	; James Guilford
	41	;
	42	; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
	43	; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
	44
	45	%include "reg_sizes.asm"
	46
224ce89b	47	%define fetch_dist 1024
7c673cae FG	48	[bits 64]
	49	default rel
	50
	51	section .text
	52
	53	%ifidn __OUTPUT_FORMAT__, win64
	54	%xdefine arg1 rcx
	55	%xdefine arg2 rdx
	56	%xdefine arg3 r8
	57
	58	%xdefine arg1_low32 ecx
	59	%else
	60	%xdefine arg1 rdi
	61	%xdefine arg2 rsi
	62	%xdefine arg3 rdx
	63
	64	%xdefine arg1_low32 edi
	65	%endif
	66
	67	%define TMP 16*0
	68	%ifidn __OUTPUT_FORMAT__, win64
	69	%define XMM_SAVE 16*2
	70	%define VARIABLE_OFFSET 16*10+8
	71	%else
	72	%define VARIABLE_OFFSET 16*2+8
	73	%endif
	74	align 16
	75	global crc32_ieee_01:function
	76	crc32_ieee_01:
	77
	78	not arg1_low32 ;~init_crc
	79
	80	sub rsp,VARIABLE_OFFSET
	81
	82	%ifidn __OUTPUT_FORMAT__, win64
	83	; push the xmm registers into the stack to maintain
	84	movdqa [rsp + XMM_SAVE + 16*0], xmm6
	85	movdqa [rsp + XMM_SAVE + 16*1], xmm7
	86	movdqa [rsp + XMM_SAVE + 16*2], xmm8
	87	movdqa [rsp + XMM_SAVE + 16*3], xmm9
	88	movdqa [rsp + XMM_SAVE + 16*4], xmm10
	89	movdqa [rsp + XMM_SAVE + 16*5], xmm11
	90	movdqa [rsp + XMM_SAVE + 16*6], xmm12
	91	movdqa [rsp + XMM_SAVE + 16*7], xmm13
	92	%endif
	93
	94
	95	; check if smaller than 256
	96	cmp arg3, 256
	97
	98	; for sizes less than 256, we can't fold 128B at a time...
	99	jl _less_than_256
	100
	101
	102	; load the initial crc value
	103	movd xmm10, arg1_low32 ; initial crc
	104
	105	; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
	106	; because data will be byte-reflected and will align with initial crc at correct place.
	107	pslldq xmm10, 12
	108
	109	movdqa xmm11, [SHUF_MASK]
	110	; receive the initial 128B data, xor the initial crc value
	111	movdqu xmm0, [arg2+16*0]
112	movdqu xmm1, [arg2+16*1]
113	movdqu xmm2, [arg2+16*2]
114	movdqu xmm3, [arg2+16*3]
115	movdqu xmm4, [arg2+16*4]
116	movdqu xmm5, [arg2+16*5]
117	movdqu xmm6, [arg2+16*6]
118	movdqu xmm7, [arg2+16*7]
119
120	pshufb xmm0, xmm11
121	; XOR the initial_crc value
122	pxor xmm0, xmm10
123	pshufb xmm1, xmm11
124	pshufb xmm2, xmm11
125	pshufb xmm3, xmm11
126	pshufb xmm4, xmm11
127	pshufb xmm5, xmm11
128	pshufb xmm6, xmm11
129	pshufb xmm7, xmm11
130
131	movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
132	;imm value of pclmulqdq instruction will determine which constant to use
133	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
134	; we subtract 256 instead of 128 to save one instruction from the loop
135	sub arg3, 256
136
137	; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
138	; loop will fold 128B at a time until we have 128+y Bytes of buffer
139
140
141	; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
142	_fold_128_B_loop:
143
144	; update the buffer pointer
145	add arg2, 128 ; buf += 128;
146
224ce89b	147	prefetchnta [arg2+fetch_dist+0]
7c673cae FG	148	movdqu xmm9, [arg2+16*0]
	149	movdqu xmm12, [arg2+16*1]
	150	pshufb xmm9, xmm11
	151	pshufb xmm12, xmm11
	152	movdqa xmm8, xmm0
	153	movdqa xmm13, xmm1
	154	pclmulqdq xmm0, xmm10, 0x0
	155	pclmulqdq xmm8, xmm10 , 0x11
	156	pclmulqdq xmm1, xmm10, 0x0
	157	pclmulqdq xmm13, xmm10 , 0x11
	158	pxor xmm0, xmm9
	159	xorps xmm0, xmm8
	160	pxor xmm1, xmm12
	161	xorps xmm1, xmm13
	162
224ce89b	163	prefetchnta [arg2+fetch_dist+32]
7c673cae FG	164	movdqu xmm9, [arg2+16*2]
	165	movdqu xmm12, [arg2+16*3]
	166	pshufb xmm9, xmm11
	167	pshufb xmm12, xmm11
	168	movdqa xmm8, xmm2
	169	movdqa xmm13, xmm3
	170	pclmulqdq xmm2, xmm10, 0x0
	171	pclmulqdq xmm8, xmm10 , 0x11
	172	pclmulqdq xmm3, xmm10, 0x0
	173	pclmulqdq xmm13, xmm10 , 0x11
	174	pxor xmm2, xmm9
	175	xorps xmm2, xmm8
	176	pxor xmm3, xmm12
	177	xorps xmm3, xmm13
	178
224ce89b	179	prefetchnta [arg2+fetch_dist+64]
7c673cae FG	180	movdqu xmm9, [arg2+16*4]
	181	movdqu xmm12, [arg2+16*5]
	182	pshufb xmm9, xmm11
	183	pshufb xmm12, xmm11
	184	movdqa xmm8, xmm4
	185	movdqa xmm13, xmm5
	186	pclmulqdq xmm4, xmm10, 0x0
	187	pclmulqdq xmm8, xmm10 , 0x11
	188	pclmulqdq xmm5, xmm10, 0x0
	189	pclmulqdq xmm13, xmm10 , 0x11
	190	pxor xmm4, xmm9
	191	xorps xmm4, xmm8
	192	pxor xmm5, xmm12
	193	xorps xmm5, xmm13
	194
224ce89b	195	prefetchnta [arg2+fetch_dist+96]
7c673cae FG	196	movdqu xmm9, [arg2+16*6]
	197	movdqu xmm12, [arg2+16*7]
	198	pshufb xmm9, xmm11
	199	pshufb xmm12, xmm11
	200	movdqa xmm8, xmm6
	201	movdqa xmm13, xmm7
	202	pclmulqdq xmm6, xmm10, 0x0
	203	pclmulqdq xmm8, xmm10 , 0x11
	204	pclmulqdq xmm7, xmm10, 0x0
	205	pclmulqdq xmm13, xmm10 , 0x11
	206	pxor xmm6, xmm9
	207	xorps xmm6, xmm8
	208	pxor xmm7, xmm12
	209	xorps xmm7, xmm13
	210
	211	sub arg3, 128
	212
	213	; check if there is another 128B in the buffer to be able to fold
	214	jge _fold_128_B_loop
	215	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	216
	217
	218	add arg2, 128
	219	; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
	220	; the 128 of folded data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3
	221
	222
	223	; fold the 8 xmm registers to 1 xmm register with different constants
	224
	225	movdqa xmm10, [rk9]
	226	movdqa xmm8, xmm0
	227	pclmulqdq xmm0, xmm10, 0x11
	228	pclmulqdq xmm8, xmm10, 0x0
	229	pxor xmm7, xmm8
	230	xorps xmm7, xmm0
	231
	232	movdqa xmm10, [rk11]
	233	movdqa xmm8, xmm1
	234	pclmulqdq xmm1, xmm10, 0x11
	235	pclmulqdq xmm8, xmm10, 0x0
	236	pxor xmm7, xmm8
	237	xorps xmm7, xmm1
	238
	239	movdqa xmm10, [rk13]
	240	movdqa xmm8, xmm2
	241	pclmulqdq xmm2, xmm10, 0x11
	242	pclmulqdq xmm8, xmm10, 0x0
	243	pxor xmm7, xmm8
	244	pxor xmm7, xmm2
	245
	246	movdqa xmm10, [rk15]
	247	movdqa xmm8, xmm3
	248	pclmulqdq xmm3, xmm10, 0x11
	249	pclmulqdq xmm8, xmm10, 0x0
	250	pxor xmm7, xmm8
	251	xorps xmm7, xmm3
	252
	253	movdqa xmm10, [rk17]
	254	movdqa xmm8, xmm4
	255	pclmulqdq xmm4, xmm10, 0x11
	256	pclmulqdq xmm8, xmm10, 0x0
	257	pxor xmm7, xmm8
	258	pxor xmm7, xmm4
	259
260	movdqa xmm10, [rk19]
261	movdqa xmm8, xmm5
262	pclmulqdq xmm5, xmm10, 0x11
263	pclmulqdq xmm8, xmm10, 0x0
264	pxor xmm7, xmm8
265	xorps xmm7, xmm5
266
267	movdqa xmm10, [rk1] ;xmm10 has rk1 and rk2
268	;imm value of pclmulqdq instruction will determine which constant to use
269	movdqa xmm8, xmm6
270	pclmulqdq xmm6, xmm10, 0x11
271	pclmulqdq xmm8, xmm10, 0x0
272	pxor xmm7, xmm8
273	pxor xmm7, xmm6
274
275
276	; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
277	; instead of a cmp instruction, we use the negative flag with the jl instruction
278	add arg3, 128-16
279	jl _final_reduction_for_128
280
281	; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
282	; we can fold 16 bytes at a time if y>=16
283	; continue folding 16B at a time
284
285	_16B_reduction_loop:
286	movdqa xmm8, xmm7
287	pclmulqdq xmm7, xmm10, 0x11
288	pclmulqdq xmm8, xmm10, 0x0
289	pxor xmm7, xmm8
290	movdqu xmm0, [arg2]
291	pshufb xmm0, xmm11
292	pxor xmm7, xmm0
293	add arg2, 16
294	sub arg3, 16
295	; instead of a cmp instruction, we utilize the flags with the jge instruction
296	; equivalent of: cmp arg3, 16-16
297	; check if there is any more 16B in the buffer to be able to fold
298	jge _16B_reduction_loop
299
300	;now we have 16+z bytes left to reduce, where 0<= z < 16.
301	;first, we reduce the data in the xmm7 register
302
303
304	_final_reduction_for_128:
305	; check if any more data to fold. If not, compute the CRC of the final 128 bits
306	add arg3, 16
307	je _128_done
308
309	; here we are getting data that is less than 16 bytes.
310	; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
311	; after that the registers need to be adjusted.
312	_get_last_two_xmms:
313	movdqa xmm2, xmm7
314
315	movdqu xmm1, [arg2 - 16 + arg3]
316	pshufb xmm1, xmm11
317
318	; get rid of the extra data that was loaded before
319	; load the shift constant
320	lea rax, [pshufb_shf_table + 16]
321	sub rax, arg3
322	movdqu xmm0, [rax]
323
324	; shift xmm2 to the left by arg3 bytes
325	pshufb xmm2, xmm0
326
327	; shift xmm7 to the right by 16-arg3 bytes
328	pxor xmm0, [mask1]
329	pshufb xmm7, xmm0
330	pblendvb xmm1, xmm2 ;xmm0 is implicit
331
332	; fold 16 Bytes
333	movdqa xmm2, xmm1
334	movdqa xmm8, xmm7
335	pclmulqdq xmm7, xmm10, 0x11
336	pclmulqdq xmm8, xmm10, 0x0
337	pxor xmm7, xmm8
338	pxor xmm7, xmm2
339
340	_128_done:
341	; compute crc of a 128-bit value
342	movdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
343	movdqa xmm0, xmm7
344
345	;64b fold
346	pclmulqdq xmm7, xmm10, 0x1
347	pslldq xmm0, 8
348	pxor xmm7, xmm0
349
350	;32b fold
351	movdqa xmm0, xmm7
352
353	pand xmm0, [mask2]
354
355	psrldq xmm7, 12
356	pclmulqdq xmm7, xmm10, 0x10
357	pxor xmm7, xmm0
358
359	;barrett reduction
360	_barrett:
361	movdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
362	movdqa xmm0, xmm7
363	pclmulqdq xmm7, xmm10, 0x01
364	pslldq xmm7, 4
365	pclmulqdq xmm7, xmm10, 0x11
366
367	pslldq xmm7, 4
368	pxor xmm7, xmm0
369	pextrd eax, xmm7,1
370
371	_cleanup:
372	not eax
373	%ifidn __OUTPUT_FORMAT__, win64
374	movdqa xmm6, [rsp + XMM_SAVE + 16*0]
375	movdqa xmm7, [rsp + XMM_SAVE + 16*1]
376	movdqa xmm8, [rsp + XMM_SAVE + 16*2]
377	movdqa xmm9, [rsp + XMM_SAVE + 16*3]
378	movdqa xmm10, [rsp + XMM_SAVE + 16*4]
379	movdqa xmm11, [rsp + XMM_SAVE + 16*5]
380	movdqa xmm12, [rsp + XMM_SAVE + 16*6]
381	movdqa xmm13, [rsp + XMM_SAVE + 16*7]
382	%endif
383	add rsp,VARIABLE_OFFSET
384	ret
385
386
387	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
388	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
389	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
390	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
391
392	align 16
393	_less_than_256:
394
395	; check if there is enough buffer to be able to fold 16B at a time
396	cmp arg3, 32
397	jl _less_than_32
398	movdqa xmm11, [SHUF_MASK]
399
400	; if there is, load the constants
401	movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
402
403	movd xmm0, arg1_low32 ; get the initial crc value
404	pslldq xmm0, 12 ; align it to its correct place
405	movdqu xmm7, [arg2] ; load the plaintext
406	pshufb xmm7, xmm11 ; byte-reflect the plaintext
407	pxor xmm7, xmm0
408
409
410	; update the buffer pointer
411	add arg2, 16
412
413	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
414	sub arg3, 32
415
416	jmp _16B_reduction_loop
417
418
419	align 16
420	_less_than_32:
421	; mov initial crc to the return value. this is necessary for zero-length buffers.
422	mov eax, arg1_low32
423	test arg3, arg3
424	je _cleanup
425
426	movdqa xmm11, [SHUF_MASK]
427
428	movd xmm0, arg1_low32 ; get the initial crc value
429	pslldq xmm0, 12 ; align it to its correct place
430
431	cmp arg3, 16
432	je _exact_16_left
433	jl _less_than_16_left
434
435	movdqu xmm7, [arg2] ; load the plaintext
436	pshufb xmm7, xmm11 ; byte-reflect the plaintext
437	pxor xmm7, xmm0 ; xor the initial crc value
438	add arg2, 16
439	sub arg3, 16
440	movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
441	jmp _get_last_two_xmms
442
443
444	align 16
445	_less_than_16_left:
446	; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
447
448	pxor xmm1, xmm1
449	mov r11, rsp
450	movdqa [r11], xmm1
451
452	cmp arg3, 4
453	jl _only_less_than_4
454
455	; backup the counter value
456	mov r9, arg3
457	cmp arg3, 8
458	jl _less_than_8_left
459
460	; load 8 Bytes
461	mov rax, [arg2]
462	mov [r11], rax
463	add r11, 8
464	sub arg3, 8
465	add arg2, 8
466	_less_than_8_left:
467
468	cmp arg3, 4
469	jl _less_than_4_left
470
471	; load 4 Bytes
472	mov eax, [arg2]
473	mov [r11], eax
474	add r11, 4
475	sub arg3, 4
476	add arg2, 4
477	_less_than_4_left:
478
479	cmp arg3, 2
480	jl _less_than_2_left
481
482	; load 2 Bytes
483	mov ax, [arg2]
484	mov [r11], ax
485	add r11, 2
486	sub arg3, 2
487	add arg2, 2
488	_less_than_2_left:
489	cmp arg3, 1
490	jl _zero_left
491
492	; load 1 Byte
493	mov al, [arg2]
494	mov [r11], al
495	_zero_left:
496	movdqa xmm7, [rsp]
497	pshufb xmm7, xmm11
498	pxor xmm7, xmm0 ; xor the initial crc value
499
500	; shl r9, 4
501	lea rax, [pshufb_shf_table + 16]
502	sub rax, r9
503	movdqu xmm0, [rax]
504	pxor xmm0, [mask1]
505
506	pshufb xmm7, xmm0
507	jmp _128_done
508
509	align 16
510	_exact_16_left:
511	movdqu xmm7, [arg2]
512	pshufb xmm7, xmm11
513	pxor xmm7, xmm0 ; xor the initial crc value
514
515	jmp _128_done
516
517	_only_less_than_4:
518	cmp arg3, 3
519	jl _only_less_than_3
520
521	; load 3 Bytes
522	mov al, [arg2]
523	mov [r11], al
524
525	mov al, [arg2+1]
526	mov [r11+1], al
527
528	mov al, [arg2+2]
529	mov [r11+2], al
530
531	movdqa xmm7, [rsp]
532	pshufb xmm7, xmm11
533	pxor xmm7, xmm0 ; xor the initial crc value
534
535	psrldq xmm7, 5
536
537	jmp _barrett
538	_only_less_than_3:
539	cmp arg3, 2
540	jl _only_less_than_2
541
542	; load 2 Bytes
543	mov al, [arg2]
544	mov [r11], al
545
546	mov al, [arg2+1]
547	mov [r11+1], al
548
549	movdqa xmm7, [rsp]
550	pshufb xmm7, xmm11
551	pxor xmm7, xmm0 ; xor the initial crc value
552
553	psrldq xmm7, 6
554
555	jmp _barrett
556	_only_less_than_2:
557
558	; load 1 Byte
559	mov al, [arg2]
560	mov [r11], al
561
562	movdqa xmm7, [rsp]
563	pshufb xmm7, xmm11
564	pxor xmm7, xmm0 ; xor the initial crc value
565
566	psrldq xmm7, 7
567
568	jmp _barrett
569
570	section .data
571
572	; precomputed constants
573	align 16
574
575	rk1 :
576	DQ 0xf200aa6600000000
577	rk2 :
578	DQ 0x17d3315d00000000
579	rk3 :
580	DQ 0x022ffca500000000
581	rk4 :
582	DQ 0x9d9ee22f00000000
583	rk5 :
584	DQ 0xf200aa6600000000
585	rk6 :
586	DQ 0x490d678d00000000
587	rk7 :
588	DQ 0x0000000104d101df
589	rk8 :
590	DQ 0x0000000104c11db7
591	rk9 :
592	DQ 0x6ac7e7d700000000
593	rk10 :
594	DQ 0xfcd922af00000000
595	rk11 :
596	DQ 0x34e45a6300000000
597	rk12 :
598	DQ 0x8762c1f600000000
599	rk13 :
600	DQ 0x5395a0ea00000000
601	rk14 :
602	DQ 0x54f2d5c700000000
603	rk15 :
604	DQ 0xd3504ec700000000
605	rk16 :
606	DQ 0x57a8445500000000
607	rk17 :
608	DQ 0xc053585d00000000
609	rk18 :
610	DQ 0x766f1b7800000000
611	rk19 :
612	DQ 0xcd8c54b500000000
613	rk20 :
614	DQ 0xab40b71e00000000
615
616
617
618
619
620
621
622
623
624	mask1:
625	dq 0x8080808080808080, 0x8080808080808080
626	mask2:
627	dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
628
629	SHUF_MASK:
630	dq 0x08090A0B0C0D0E0F, 0x0001020304050607
631
632	pshufb_shf_table:
633	; use these values for shift constants for the pshufb instruction
634	; different alignments result in values as shown:
635	; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
636	; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2
637	; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3
638	; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
639	; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
640	; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
641	; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
642	; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
643	; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
644	; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
645	; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
646	; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
647	; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
648	; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
649	; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
650	dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
651	dq 0x0706050403020100, 0x000e0d0c0b0a0908
652
653	;;; func core, ver, snum
654	slversion crc32_ieee_01, 01, 06, 0011
655