;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Function API:
;     uint64_t crc64_ecma_norm_by8(
;             uint64_t init_crc,        //initial CRC value, 64 bits
;             const unsigned char *buf, //buffer pointer to calculate CRC on
;             uint64_t len              //buffer length in bytes (64-bit data)
;     );
;
; yasm -f elf64 -X gnu -g dwarf2 crc64_ecma_norm_by8.asm
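;
; A minimal calling sketch (illustrative only, not part of the original
; source; assumes the System V AMD64 ABI on Linux, and the label `msg`,
; the length `msg_len`, and the seed value are hypothetical):
;
;     mov     rdi, 0                   ; init_crc (seed)
;     lea     rsi, [msg]               ; buf
;     mov     rdx, msg_len             ; len in bytes
;     call    crc64_ecma_norm_by8
;     ; the 64-bit CRC is returned in rax
;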
%include "reg_sizes.asm"

%define fetch_dist 1024

[bits 64]
default rel

section .text

%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx
%endif

%define TMP 16*0
%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_SAVE 16*2
        %define VARIABLE_OFFSET 16*10+8
%else
        %define VARIABLE_OFFSET 16*2+8
%endif
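
; Note on VARIABLE_OFFSET (an added explanation, not in the original source):
; at function entry rsp is 8 mod 16, since the call pushed a return address
; onto an ABI-aligned stack, so subtracting 16*n+8 restores 16-byte alignment
; for the aligned movdqa saves/restores below; on win64 the extra 16*8 bytes
; hold the callee-saved registers xmm6-xmm13.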
align 16
mk_global crc64_ecma_norm_by8, function
crc64_ecma_norm_by8:
        endbranch

        not     arg1                    ; ~init_crc

        sub     rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
        ; save the xmm registers on the stack (callee-saved on win64)
        movdqa  [rsp + XMM_SAVE + 16*0], xmm6
        movdqa  [rsp + XMM_SAVE + 16*1], xmm7
        movdqa  [rsp + XMM_SAVE + 16*2], xmm8
        movdqa  [rsp + XMM_SAVE + 16*3], xmm9
        movdqa  [rsp + XMM_SAVE + 16*4], xmm10
        movdqa  [rsp + XMM_SAVE + 16*5], xmm11
        movdqa  [rsp + XMM_SAVE + 16*6], xmm12
        movdqa  [rsp + XMM_SAVE + 16*7], xmm13
%endif


        ; check if smaller than 256
        cmp     arg3, 256

        ; for sizes less than 256, we can't fold 128B at a time...
        jl      _less_than_256


        ; load the initial crc value
        movq    xmm10, arg1             ; initial crc

        ; the crc value does not need to be byte-reflected, but it does need to
        ; be moved to the high part of the register, because the data will be
        ; byte-reflected and will then align with the initial crc in the right place
        pslldq  xmm10, 8

        movdqa  xmm11, [SHUF_MASK]
        ; receive the initial 128B data, xor the initial crc value
        movdqu  xmm0, [arg2+16*0]
        movdqu  xmm1, [arg2+16*1]
        movdqu  xmm2, [arg2+16*2]
        movdqu  xmm3, [arg2+16*3]
        movdqu  xmm4, [arg2+16*4]
        movdqu  xmm5, [arg2+16*5]
        movdqu  xmm6, [arg2+16*6]
        movdqu  xmm7, [arg2+16*7]

        pshufb  xmm0, xmm11
        ; XOR the initial_crc value
        pxor    xmm0, xmm10
        pshufb  xmm1, xmm11
        pshufb  xmm2, xmm11
        pshufb  xmm3, xmm11
        pshufb  xmm4, xmm11
        pshufb  xmm5, xmm11
        pshufb  xmm6, xmm11
        pshufb  xmm7, xmm11

        movdqa  xmm10, [rk3]            ; xmm10 has rk3 and rk4
                                        ; the imm value of the pclmulqdq instruction
                                        ; will determine which constant to use
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; we subtract 256 instead of 128 to save one instruction from the loop
        sub     arg3, 256

        ; at this point there are 128*x+y (0<=y<128) bytes of buffer left.
        ; _fold_128_B_loop folds 128B at a time until 128+y bytes of buffer remain


        ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
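        ;
        ; Per lane, one iteration applies the standard pclmulqdq folding identity
        ; (a sketch of the math in GF(2) polynomial arithmetic, not new code):
        ;     acc' = (acc_low * rk3) xor (acc_high * rk4) xor new_data
        ; imm 0x00 multiplies the two low qwords (acc_low * rk3) and imm 0x11 the
        ; two high qwords (acc_high * rk4); rk3/rk4 are presumably constants of
        ; the form x^n mod P(x) matching the 128-byte fold distance.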
_fold_128_B_loop:

        ; update the buffer pointer
        add     arg2, 128               ; buf += 128;

        prefetchnta [arg2+fetch_dist+0]
        movdqu  xmm9, [arg2+16*0]
        movdqu  xmm12, [arg2+16*1]
        pshufb  xmm9, xmm11
        pshufb  xmm12, xmm11
        movdqa  xmm8, xmm0
        movdqa  xmm13, xmm1
        pclmulqdq xmm0, xmm10, 0x0
        pclmulqdq xmm8, xmm10, 0x11
        pclmulqdq xmm1, xmm10, 0x0
        pclmulqdq xmm13, xmm10, 0x11
        pxor    xmm0, xmm9
        xorps   xmm0, xmm8
        pxor    xmm1, xmm12
        xorps   xmm1, xmm13

        prefetchnta [arg2+fetch_dist+32]
        movdqu  xmm9, [arg2+16*2]
        movdqu  xmm12, [arg2+16*3]
        pshufb  xmm9, xmm11
        pshufb  xmm12, xmm11
        movdqa  xmm8, xmm2
        movdqa  xmm13, xmm3
        pclmulqdq xmm2, xmm10, 0x0
        pclmulqdq xmm8, xmm10, 0x11
        pclmulqdq xmm3, xmm10, 0x0
        pclmulqdq xmm13, xmm10, 0x11
        pxor    xmm2, xmm9
        xorps   xmm2, xmm8
        pxor    xmm3, xmm12
        xorps   xmm3, xmm13

        prefetchnta [arg2+fetch_dist+64]
        movdqu  xmm9, [arg2+16*4]
        movdqu  xmm12, [arg2+16*5]
        pshufb  xmm9, xmm11
        pshufb  xmm12, xmm11
        movdqa  xmm8, xmm4
        movdqa  xmm13, xmm5
        pclmulqdq xmm4, xmm10, 0x0
        pclmulqdq xmm8, xmm10, 0x11
        pclmulqdq xmm5, xmm10, 0x0
        pclmulqdq xmm13, xmm10, 0x11
        pxor    xmm4, xmm9
        xorps   xmm4, xmm8
        pxor    xmm5, xmm12
        xorps   xmm5, xmm13

        prefetchnta [arg2+fetch_dist+96]
        movdqu  xmm9, [arg2+16*6]
        movdqu  xmm12, [arg2+16*7]
        pshufb  xmm9, xmm11
        pshufb  xmm12, xmm11
        movdqa  xmm8, xmm6
        movdqa  xmm13, xmm7
        pclmulqdq xmm6, xmm10, 0x0
        pclmulqdq xmm8, xmm10, 0x11
        pclmulqdq xmm7, xmm10, 0x0
        pclmulqdq xmm13, xmm10, 0x11
        pxor    xmm6, xmm9
        xorps   xmm6, xmm8
        pxor    xmm7, xmm12
        xorps   xmm7, xmm13

        sub     arg3, 128

        ; check if there is another 128B in the buffer to be able to fold
        jge     _fold_128_B_loop
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        add     arg2, 128
        ; at this point, the buffer pointer is pointing at the last y bytes of the buffer, where 0 <= y < 128
        ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7


        ; fold the 8 xmm registers to 1 xmm register with different constants

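        ; (added note) each of xmm0..xmm6 sits at a different byte distance from
        ; the end of the message, so each fold below uses its own constant pair
        ; (rk9/rk10 through rk19/rk20, then rk1/rk2 for the 16B steps); the low
        ; qword of each pair is selected via imm 0x0 and the high qword via imm 0x11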
        movdqa  xmm10, [rk9]
        movdqa  xmm8, xmm0
        pclmulqdq xmm0, xmm10, 0x11
        pclmulqdq xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        xorps   xmm7, xmm0

        movdqa  xmm10, [rk11]
        movdqa  xmm8, xmm1
        pclmulqdq xmm1, xmm10, 0x11
        pclmulqdq xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        xorps   xmm7, xmm1

        movdqa  xmm10, [rk13]
        movdqa  xmm8, xmm2
        pclmulqdq xmm2, xmm10, 0x11
        pclmulqdq xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        pxor    xmm7, xmm2

        movdqa  xmm10, [rk15]
        movdqa  xmm8, xmm3
        pclmulqdq xmm3, xmm10, 0x11
        pclmulqdq xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        xorps   xmm7, xmm3

        movdqa  xmm10, [rk17]
        movdqa  xmm8, xmm4
        pclmulqdq xmm4, xmm10, 0x11
        pclmulqdq xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        pxor    xmm7, xmm4

        movdqa  xmm10, [rk19]
        movdqa  xmm8, xmm5
        pclmulqdq xmm5, xmm10, 0x11
        pclmulqdq xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        xorps   xmm7, xmm5

        movdqa  xmm10, [rk1]            ; xmm10 has rk1 and rk2

        movdqa  xmm8, xmm6
        pclmulqdq xmm6, xmm10, 0x11
        pclmulqdq xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        pxor    xmm7, xmm6


        ; instead of 128, we add 112 (128-16) to the loop counter to save one instruction from the loop
        ; instead of a cmp instruction, we use the negative flag with the jl instruction
        add     arg3, 128-16
        jl      _final_reduction_for_128

        ; now we have 16+y bytes left to reduce. 16 bytes are in register xmm7 and the rest is in memory
        ; we can fold 16 bytes at a time if y>=16
        ; continue folding 16B at a time

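        ; (added note) each iteration below folds xmm7 forward over the next 16B
        ; of input; sketching the same identity as above, with the rk1/rk2 pair:
        ;     xmm7' = (xmm7_low * rk1) xor (xmm7_high * rk2) xor next_16B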
_16B_reduction_loop:
        movdqa  xmm8, xmm7
        pclmulqdq xmm7, xmm10, 0x11
        pclmulqdq xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        movdqu  xmm0, [arg2]
        pshufb  xmm0, xmm11
        pxor    xmm7, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the jge instruction
        ; equivalent of: cmp arg3, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge     _16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm7 register


_final_reduction_for_128:
        ; check if any more data to fold. If not, compute the CRC of the final 128 bits
        add     arg3, 16
        je      _128_done

        ; here we have fewer than 16 bytes of data left.
        ; since we know there was data before the current pointer, we can back the
        ; input pointer up so that it loads exactly 16 bytes, and then adjust the
        ; registers to discard the extra data.
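        ; (added note) the shuffle masks come from pshufb_shf_table in the data
        ; section: indexing at [pshufb_shf_table + 16 - len] yields a left-shift
        ; mask, and xoring it with mask1 (all 0x80 bytes) flips it into the
        ; complementary right-shift mask, since pshufb zeroes any lane whose
        ; mask byte has the top bit set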
_get_last_two_xmms:
        movdqa  xmm2, xmm7

        movdqu  xmm1, [arg2 - 16 + arg3]
        pshufb  xmm1, xmm11

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, arg3
        movdqu  xmm0, [rax]

        ; shift xmm2 to the left by arg3 bytes
        pshufb  xmm2, xmm0

        ; shift xmm7 to the right by 16-arg3 bytes
        pxor    xmm0, [mask1]
        pshufb  xmm7, xmm0
        pblendvb xmm1, xmm2             ; xmm0 is implicit

        ; fold 16 bytes
        movdqa  xmm2, xmm1
        movdqa  xmm8, xmm7
        pclmulqdq xmm7, xmm10, 0x11
        pclmulqdq xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        pxor    xmm7, xmm2

_128_done:
        ; compute crc of a 128-bit value
        movdqa  xmm10, [rk5]            ; rk5 and rk6 in xmm10
        movdqa  xmm0, xmm7

        ; 64b fold
        pclmulqdq xmm7, xmm10, 0x01     ; H*L
        pslldq  xmm0, 8
        pxor    xmm7, xmm0

        ; barrett reduction
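        ; (added note, inferred from the constant values rather than stated in
        ; the source) the 64b fold above multiplied the high half by rk5 and
        ; xored in the low half shifted up; for the step below, rk8 =
        ; 0x42f0e1eba9ea3693 is the ECMA-182 polynomial P(x) with its x^64 term
        ; dropped, and rk7 is presumably the Barrett constant floor(x^128 / P(x)).
        ; The classic pclmulqdq Barrett step estimates the quotient by
        ; multiplying the high half of the remainder by rk7, multiplies that
        ; quotient by P(x), and xors with the original value so only the 64-bit
        ; remainder survives in the low qword.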
_barrett:
        movdqa  xmm10, [rk7]            ; rk7 and rk8 in xmm10
        movdqa  xmm0, xmm7

        movdqa  xmm1, xmm7
        pand    xmm1, [mask3]
        pclmulqdq xmm7, xmm10, 0x01
        pxor    xmm7, xmm1

        pclmulqdq xmm7, xmm10, 0x11
        pxor    xmm7, xmm0
        pextrq  rax, xmm7, 0

_cleanup:
        not     rax
%ifidn __OUTPUT_FORMAT__, win64
        movdqa  xmm6, [rsp + XMM_SAVE + 16*0]
        movdqa  xmm7, [rsp + XMM_SAVE + 16*1]
        movdqa  xmm8, [rsp + XMM_SAVE + 16*2]
        movdqa  xmm9, [rsp + XMM_SAVE + 16*3]
        movdqa  xmm10, [rsp + XMM_SAVE + 16*4]
        movdqa  xmm11, [rsp + XMM_SAVE + 16*5]
        movdqa  xmm12, [rsp + XMM_SAVE + 16*6]
        movdqa  xmm13, [rsp + XMM_SAVE + 16*7]
%endif
        add     rsp, VARIABLE_OFFSET
        ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_256:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      _less_than_32
        movdqa  xmm11, [SHUF_MASK]

        ; if there is, load the constants
        movdqa  xmm10, [rk1]            ; rk1 and rk2 in xmm10

        movq    xmm0, arg1              ; get the initial crc value
        pslldq  xmm0, 8                 ; align it to its correct place
        movdqu  xmm7, [arg2]            ; load the plaintext
        pshufb  xmm7, xmm11             ; byte-reflect the plaintext
        pxor    xmm7, xmm0


        ; update the buffer pointer
        add     arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        sub     arg3, 32

        jmp     _16B_reduction_loop
align 16
_less_than_32:
        ; mov initial crc to the return value. this is necessary for zero-length buffers.
        mov     rax, arg1
        test    arg3, arg3
        je      _cleanup

        movdqa  xmm11, [SHUF_MASK]

        movq    xmm0, arg1              ; get the initial crc value
        pslldq  xmm0, 8                 ; align it to its correct place

        cmp     arg3, 16
        je      _exact_16_left
        jl      _less_than_16_left

        movdqu  xmm7, [arg2]            ; load the plaintext
        pshufb  xmm7, xmm11             ; byte-reflect the plaintext
        pxor    xmm7, xmm0              ; xor the initial crc value
        add     arg2, 16
        sub     arg3, 16
        movdqa  xmm10, [rk1]            ; rk1 and rk2 in xmm10
        jmp     _get_last_two_xmms
align 16
_less_than_16_left:
        ; use stack space to load data less than 16 bytes; zero out the 16B of stack memory first
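        ; (added note) the cascade below stages an arbitrary 1-15 byte tail into
        ; that zeroed 16B slot by copying the largest power-of-two pieces first
        ; (8, then 4, then 2, then 1 bytes), so one aligned 16B load can pick it up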
        pxor    xmm1, xmm1
        mov     r11, rsp
        movdqa  [r11], xmm1

        ; backup the counter value
        mov     r9, arg3
        cmp     arg3, 8
        jl      _less_than_8_left

        ; load 8 Bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
_less_than_8_left:

        cmp     arg3, 4
        jl      _less_than_4_left

        ; load 4 Bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
_less_than_4_left:

        cmp     arg3, 2
        jl      _less_than_2_left

        ; load 2 Bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
_less_than_2_left:
        cmp     arg3, 1
        jl      _zero_left

        ; load 1 Byte
        mov     al, [arg2]
        mov     [r11], al
_zero_left:
        movdqa  xmm7, [rsp]
        pshufb  xmm7, xmm11
        pxor    xmm7, xmm0              ; xor the initial crc value

        ; shl r9, 4
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, r9

        cmp     r9, 8
        jl      _end_1to7

_end_8to15:
        movdqu  xmm0, [rax]
        pxor    xmm0, [mask1]

        pshufb  xmm7, xmm0
        jmp     _128_done

_end_1to7:
        ; Right shift (8-length) bytes in XMM
        add     rax, 8
        movdqu  xmm0, [rax]
        pshufb  xmm7, xmm0

        jmp     _barrett
align 16
_exact_16_left:
        movdqu  xmm7, [arg2]
        pshufb  xmm7, xmm11
        pxor    xmm7, xmm0              ; xor the initial crc value

        jmp     _128_done

section .data

; precomputed constants
align 16

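; (added note, an inference from how the code uses them rather than stated in
; the source) the rk values below are presumably precomputed fold/reduction
; constants of the form x^n mod P(x) for the ECMA-182 polynomial: rk3/rk4
; drive the 128B folds, rk9-rk20 the 8-to-1 register folds, rk1/rk2 the 16B
; folds, and rk5-rk8 the final 64b fold and Barrett reduction.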
rk1:
DQ 0x05f5c3c7eb52fab6
rk2:
DQ 0x4eb938a7d257740e
rk3:
DQ 0x05cf79dea9ac37d6
rk4:
DQ 0x001067e571d7d5c2
rk5:
DQ 0x05f5c3c7eb52fab6
rk6:
DQ 0x0000000000000000
rk7:
DQ 0x578d29d06cc4f872
rk8:
DQ 0x42f0e1eba9ea3693
rk9:
DQ 0xe464f4df5fb60ac1
rk10:
DQ 0xb649c5b35a759cf2
rk11:
DQ 0x9af04e1eff82d0dd
rk12:
DQ 0x6e82e609297f8fe8
rk13:
DQ 0x097c516e98bd2e73
rk14:
DQ 0x0b76477b31e22e7b
rk15:
DQ 0x5f6843ca540df020
rk16:
DQ 0xddf4b6981205b83f
rk17:
DQ 0x54819d8713758b2c
rk18:
DQ 0x4a6b90073eb0af5a
rk19:
DQ 0x571bee0a227ef92b
rk20:
DQ 0x44bef2a201b5200c


mask1:
dq 0x8080808080808080, 0x8080808080808080
mask2:
dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
mask3:
dq 0x0000000000000000, 0xFFFFFFFFFFFFFFFF

SHUF_MASK:
dq 0x08090A0B0C0D0E0F, 0x0001020304050607

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x0f0e0d0c0b0a0908
dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
dq 0x8080808080808080, 0x8080808080808080
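; (added note) the code indexes this table as [pshufb_shf_table + 16 - len]
; (plus 8 more in _end_1to7), so the selected 16B window straddles the rows
; above: bytes with the top bit set are zeroed by pshufb while the rest move
; source bytes, producing the shl/shr masks enumerated in the comment table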

;;;       func                 core, ver, snum
slversion crc64_ecma_norm_by8, 01,   00,  001a