[ceph.git] / ceph / src / isa-l / crc / crc64_ecma_norm_by8.asm

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;       Function API:
;       uint64_t crc64_ecma_norm_by8(
;               uint64_t init_crc, //initial CRC value, 64 bits
;               const unsigned char *buf, //buffer pointer to calculate CRC on
;               uint64_t len //buffer length in bytes (64-bit data)
;       );
;
;       yasm -f x64 -f elf64 -X gnu -g dwarf2 crc64_ecma_norm_by8
%include "reg_sizes.asm"

%define	fetch_dist	1024	; byte offset added to the read pointer by the prefetchnta hints

[bits 64]
default rel

section .text

; TMP names offset 0 of the local stack frame.
%define TMP 16*0

; Settings that depend on the calling convention, chosen once from the
; assembler output format:
;   arg1..arg3      - registers carrying init_crc, buf and len
;   XMM_SAVE        - frame offset of the xmm6-xmm13 save area (win64 only)
;   VARIABLE_OFFSET - number of bytes the routine reserves on the stack
%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_SAVE 16*2
        %define VARIABLE_OFFSET 16*10+8
        %xdefine        arg3 r8
        %xdefine        arg2 rdx
        %xdefine        arg1 rcx
%else
        %define VARIABLE_OFFSET 16*2+8
        %xdefine        arg3 rdx
        %xdefine        arg2 rsi
        %xdefine        arg1 rdi
%endif
;-----------------------------------------------------------------------
; uint64_t crc64_ecma_norm_by8(uint64_t init_crc, const unsigned char *buf, uint64_t len)
;
; In:    arg1 = init_crc, arg2 = buf, arg3 = len
;        (rcx, rdx, r8 when assembling for win64; rdi, rsi, rdx otherwise)
; Out:   rax = resulting CRC. The seed is complemented on entry and the
;        result is complemented again at _cleanup.
; Stack: VARIABLE_OFFSET bytes are reserved. The 16 bytes at [rsp] serve as a
;        bounce buffer for inputs shorter than 16 bytes; win64 builds also
;        keep xmm6-xmm13 at [rsp + XMM_SAVE].
; Uses:  rax, r9, r11, xmm0-xmm13 and the flags; arg1, arg2 and arg3 are
;        overwritten.
;-----------------------------------------------------------------------
align 16
global	crc64_ecma_norm_by8:ISAL_SYM_TYPE_FUNCTION
crc64_ecma_norm_by8:

	not	arg1      ; arg1 = ~init_crc; every path below works on the complemented seed

	; reserve the local frame. VARIABLE_OFFSET is an odd multiple of 8, so if
	; rsp was 8 mod 16 at entry (the usual state right after a call) rsp is now
	; 16-byte aligned, which the movdqa accesses to [rsp + ...] rely on.
	sub	rsp,VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
        ; save xmm6-xmm13 on the stack so they can be restored at _cleanup
        ; (the Microsoft x64 convention treats xmm6-xmm15 as callee-saved)
        movdqa  [rsp + XMM_SAVE + 16*0], xmm6
        movdqa  [rsp + XMM_SAVE + 16*1], xmm7
        movdqa  [rsp + XMM_SAVE + 16*2], xmm8
        movdqa  [rsp + XMM_SAVE + 16*3], xmm9
        movdqa  [rsp + XMM_SAVE + 16*4], xmm10
        movdqa  [rsp + XMM_SAVE + 16*5], xmm11
        movdqa  [rsp + XMM_SAVE + 16*6], xmm12
        movdqa  [rsp + XMM_SAVE + 16*7], xmm13
%endif


	; check if smaller than 256
	; NOTE(review): jl is a signed test, so a len with bit 63 set is also
	; routed to _less_than_256.
	cmp	arg3, 256

	; for sizes less than 256, we can't fold 128B at a time...
	jl	_less_than_256


	; Large-buffer path (len >= 256): prime eight 16-byte accumulators
	; xmm0..xmm7 with the first 128 bytes of the buffer.

	; load the initial crc value
	movq	xmm10, arg1	; initial crc (already complemented) in the low qword

	; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
	; because data will be byte-reflected and will align with initial crc at correct place.
	pslldq	xmm10, 8	; crc now occupies the high qword, low qword is zero

	movdqa xmm11, [SHUF_MASK]	; xmm11 = pshufb control that reverses all 16 bytes; kept for the whole routine
	; receive the initial 128B data, xor the initial crc value
	movdqu	xmm0, [arg2+16*0]
	movdqu	xmm1, [arg2+16*1]
	movdqu	xmm2, [arg2+16*2]
	movdqu	xmm3, [arg2+16*3]
	movdqu	xmm4, [arg2+16*4]
	movdqu	xmm5, [arg2+16*5]
	movdqu	xmm6, [arg2+16*6]
	movdqu	xmm7, [arg2+16*7]

	pshufb	xmm0, xmm11	; byte-reverse each block so buf[0] ends up in the top byte
	; XOR the initial_crc value
	pxor	xmm0, xmm10	; the crc is mixed into the first 8 bytes of the message only
	pshufb	xmm1, xmm11
	pshufb	xmm2, xmm11
	pshufb	xmm3, xmm11
	pshufb	xmm4, xmm11
	pshufb	xmm5, xmm11
	pshufb	xmm6, xmm11
	pshufb	xmm7, xmm11

	movdqa	xmm10, [rk3]	;xmm10 has rk3 (low qword) and rk4 (high qword)
					;imm value of pclmulqdq instruction will determine which constant to use
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	; we subtract 256 instead of 128 to save one instruction from the loop
	; (arg3 = len - 256: non-negative exactly when a further 128 bytes can be
	; folded after the next 128-byte load)
	sub	arg3, 256

	; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
	; loop will fold 128B at a time until we have 128+y Bytes of buffer


	; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
	;
	; Register roles inside the loop:
	;   xmm0-xmm7  running accumulators (128 bytes of folded state)
	;   xmm10      rk3 (low qword) : rk4 (high qword)
	;   xmm11      byte-reversal shuffle control
	;   xmm9,xmm12 freshly loaded data blocks
	;   xmm8,xmm13 copies of an accumulator for the high-half multiply
	; Each accumulator A is replaced by
	;   clmul(A.low, rk3) xor clmul(A.high, rk4) xor new_data
	; (pclmulqdq imm 0x0 multiplies the two low qwords, 0x11 the two high qwords).
	; pxor and xorps are both a full-width bitwise XOR; mixing them is presumably
	; an instruction-scheduling choice.
_fold_128_B_loop:

	; update the buffer pointer
	add	arg2, 128		;    buf += 128;

	; the prefetches are hints for data fetch_dist bytes ahead of arg2
	prefetchnta [arg2+fetch_dist+0]
	movdqu	xmm9, [arg2+16*0]
	movdqu	xmm12, [arg2+16*1]
	pshufb	xmm9, xmm11
	pshufb	xmm12, xmm11
	movdqa	xmm8, xmm0
	movdqa	xmm13, xmm1
	pclmulqdq	xmm0, xmm10, 0x0
	pclmulqdq	xmm8, xmm10 , 0x11
	pclmulqdq	xmm1, xmm10, 0x0
	pclmulqdq	xmm13, xmm10 , 0x11
	pxor	xmm0, xmm9
	xorps	xmm0, xmm8
	pxor	xmm1, xmm12
	xorps	xmm1, xmm13

	; same step for accumulators xmm2 and xmm3
	prefetchnta [arg2+fetch_dist+32]
	movdqu	xmm9, [arg2+16*2]
	movdqu	xmm12, [arg2+16*3]
	pshufb	xmm9, xmm11
	pshufb	xmm12, xmm11
	movdqa	xmm8, xmm2
	movdqa	xmm13, xmm3
	pclmulqdq	xmm2, xmm10, 0x0
	pclmulqdq	xmm8, xmm10 , 0x11
	pclmulqdq	xmm3, xmm10, 0x0
	pclmulqdq	xmm13, xmm10 , 0x11
	pxor	xmm2, xmm9
	xorps	xmm2, xmm8
	pxor	xmm3, xmm12
	xorps	xmm3, xmm13

	; same step for accumulators xmm4 and xmm5
	prefetchnta [arg2+fetch_dist+64]
	movdqu	xmm9, [arg2+16*4]
	movdqu	xmm12, [arg2+16*5]
	pshufb	xmm9, xmm11
	pshufb	xmm12, xmm11
	movdqa	xmm8, xmm4
	movdqa	xmm13, xmm5
	pclmulqdq	xmm4, xmm10, 0x0
	pclmulqdq	xmm8, xmm10 , 0x11
	pclmulqdq	xmm5, xmm10, 0x0
	pclmulqdq	xmm13, xmm10 , 0x11
	pxor	xmm4, xmm9
	xorps	xmm4, xmm8
	pxor	xmm5, xmm12
	xorps	xmm5, xmm13

	; same step for accumulators xmm6 and xmm7
	prefetchnta [arg2+fetch_dist+96]
	movdqu	xmm9, [arg2+16*6]
	movdqu	xmm12, [arg2+16*7]
	pshufb	xmm9, xmm11
	pshufb	xmm12, xmm11
	movdqa	xmm8, xmm6
	movdqa	xmm13, xmm7
	pclmulqdq	xmm6, xmm10, 0x0
	pclmulqdq	xmm8, xmm10 , 0x11
	pclmulqdq	xmm7, xmm10, 0x0
	pclmulqdq	xmm13, xmm10 , 0x11
	pxor	xmm6, xmm9
	xorps	xmm6, xmm8
	pxor	xmm7, xmm12
	xorps	xmm7, xmm13

	; the flags of this sub drive the loop branch; on exit arg3 is in
	; [-128, -1] and arg3 + 128 is the number of bytes not yet read
	sub	arg3, 128

	; check if there is another 128B in the buffer to be able to fold
	jge	_fold_128_B_loop
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

	; step past the 128 bytes consumed by the final loop iteration
	add	arg2, 128
        ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
        ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7


	; fold the 8 xmm registers to 1 xmm register with different constants
	;
	; Every step loads one 16-byte constant pair (low qword = the odd-numbered
	; rk, high qword = the following even-numbered rk) and accumulates
	;   clmul(X.low, rk_odd) xor clmul(X.high, rk_even)
	; into xmm7. xmm0 uses rk9:rk10, xmm1 rk11:rk12, ... xmm5 rk19:rk20 and
	; xmm6 uses rk1:rk2.

	movdqa	xmm10, [rk9]
	movdqa	xmm8, xmm0
	pclmulqdq	xmm0, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	xorps	xmm7, xmm0

	movdqa	xmm10, [rk11]
	movdqa	xmm8, xmm1
	pclmulqdq	xmm1, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	xorps	xmm7, xmm1

	movdqa	xmm10, [rk13]
	movdqa	xmm8, xmm2
	pclmulqdq	xmm2, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	pxor	xmm7, xmm2

	movdqa	xmm10, [rk15]
	movdqa	xmm8, xmm3
	pclmulqdq	xmm3, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	xorps	xmm7, xmm3

	movdqa	xmm10, [rk17]
	movdqa	xmm8, xmm4
	pclmulqdq	xmm4, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	pxor	xmm7, xmm4

	movdqa	xmm10, [rk19]
	movdqa	xmm8, xmm5
	pclmulqdq	xmm5, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	xorps	xmm7, xmm5

	movdqa	xmm10, [rk1]	;xmm10 has rk1 and rk2
				;it stays loaded for the 16-byte folds that follow

	movdqa	xmm8, xmm6
	pclmulqdq	xmm6, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	pxor	xmm7, xmm6


	; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
	; instead of a cmp instruction, we use the negative flag with the jl instruction
	; (after the add, arg3 = unread bytes - 16; negative means fewer than 16 remain)
	add	arg3, 128-16
	jl	_final_reduction_for_128

	; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
	; we can fold 16 bytes at a time if y>=16
	; continue folding 16B at a time

	; Also entered from _less_than_256. Expected on entry:
	;   xmm7  = 16-byte accumulator
	;   xmm10 = rk1 (low qword) : rk2 (high qword)
	;   xmm11 = byte-reversal shuffle control
	;   arg2  = next unread byte, arg3 = unread bytes - 16 (>= 0)
_16B_reduction_loop:
	movdqa	xmm8, xmm7
	pclmulqdq	xmm7, xmm10, 0x11	; xmm7 = acc.high * rk2
	pclmulqdq	xmm8, xmm10, 0x0	; xmm8 = acc.low  * rk1
	pxor	xmm7, xmm8
	movdqu	xmm0, [arg2]		; next 16 input bytes
	pshufb	xmm0, xmm11
	pxor	xmm7, xmm0
	add	arg2, 16
	sub	arg3, 16
	; instead of a cmp instruction, we utilize the flags with the jge instruction
	; equivalent of: cmp arg3, 16-16
	; check if there is any more 16B in the buffer to be able to fold
	jge	_16B_reduction_loop

	;now we have 16+z bytes left to reduce, where 0<= z < 16.
	;first, we reduce the data in the xmm7 register


; Handles the 0..15 bytes that remain after all whole 16-byte blocks are folded.
; On entry arg3 = remaining bytes - 16 (a value in [-16, -1]).
_final_reduction_for_128:
	; check if any more data to fold. If not, compute the CRC of the final 128 bits
	add	arg3, 16	; arg3 = remaining bytes (0..15); ZF set when none are left
	je	_128_done

	; here we are getting data that is less than 16 bytes.
	; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
	; after that the registers need to be adjusted.
	; Also entered from _less_than_32 with arg3 in 1..15, xmm7 = accumulator,
	; xmm10 = rk1:rk2 and xmm11 = byte-reversal control.
_get_last_two_xmms:
	movdqa	xmm2, xmm7

	movdqu	xmm1, [arg2 - 16 + arg3]	; the final 16 bytes of the buffer (overlaps bytes already folded)
	pshufb	xmm1, xmm11

	; get rid of the extra data that was loaded before
	; load the shift constant
	lea	rax, [pshufb_shf_table + 16]
	sub	rax, arg3	; rax = pshufb_shf_table + 16 - arg3
	movdqu	xmm0, [rax]

	; shift xmm2 to the left by arg3 bytes
	pshufb	xmm2, xmm0

	; shift xmm7 to the right by 16-arg3 bytes
	pxor	xmm0, [mask1]	; toggling bit 7 of every control byte turns the left-shift control into the matching right-shift control
	pshufb	xmm7, xmm0
	; for each byte: take xmm2's byte where bit 7 of the xmm0 byte is set,
	; otherwise keep xmm1's byte (the not-yet-folded tail bytes)
	pblendvb	xmm1, xmm2	;xmm0 is implicit

	; fold 16 Bytes
	movdqa	xmm2, xmm1
	movdqa	xmm8, xmm7
	pclmulqdq	xmm7, xmm10, 0x11
	pclmulqdq	xmm8, xmm10, 0x0
	pxor	xmm7, xmm8
	pxor	xmm7, xmm2

; Final reduction of the 128-bit accumulator in xmm7 down to the 64-bit result.
_128_done:
	; compute crc of a 128-bit value
	movdqa	xmm10, [rk5]	; rk5 and rk6 in xmm10
	movdqa	xmm0, xmm7

	;64b fold
	; xmm7 = clmul(acc.high, rk5) xor (acc.low moved up into the high qword)
	pclmulqdq	xmm7, xmm10, 0x01	; H*L
	pslldq	xmm0, 8
	pxor	xmm7, xmm0

	;barrett reduction
	; _end_1to7 jumps straight here, skipping the 64b fold above.
_barrett:
	movdqa	xmm10, [rk7]	; rk7 and rk8 in xmm10
	movdqa	xmm0, xmm7	; xmm0 keeps the unreduced value for the last xor

	movdqa	xmm1, xmm7
        pand    xmm1, [mask3]	; keep only the high qword
	pclmulqdq	xmm7, xmm10, 0x01	; high qword * rk7
	pxor	xmm7, xmm1

	pclmulqdq	xmm7, xmm10, 0x11	; high qword * rk8
	pxor	xmm7, xmm0
	pextrq	rax, xmm7, 0	; the low qword is the (still complemented) crc

; Common exit. rax holds the complemented result; for len == 0 it holds the
; complemented seed, so the caller gets init_crc back unchanged.
_cleanup:
	not     rax
%ifidn __OUTPUT_FORMAT__, win64
        ; restore the xmm registers saved in the prologue
        movdqa  xmm6, [rsp + XMM_SAVE + 16*0]
        movdqa  xmm7, [rsp + XMM_SAVE + 16*1]
        movdqa  xmm8, [rsp + XMM_SAVE + 16*2]
        movdqa  xmm9, [rsp + XMM_SAVE + 16*3]
        movdqa  xmm10, [rsp + XMM_SAVE + 16*4]
        movdqa  xmm11, [rsp + XMM_SAVE + 16*5]
        movdqa  xmm12, [rsp + XMM_SAVE + 16*6]
        movdqa  xmm13, [rsp + XMM_SAVE + 16*7]
%endif
	add	rsp, VARIABLE_OFFSET	; release the local frame
	ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Path for len < 256 (signed compare in the prologue). Buffers of 32 bytes or
; more are folded 16 bytes at a time; shorter ones go to _less_than_32.
align 16
_less_than_256:

	; check if there is enough buffer to be able to fold 16B at a time
	cmp	arg3, 32
	jl	_less_than_32
	movdqa xmm11, [SHUF_MASK]	; byte-reversal shuffle control

	; if there is, load the constants
	movdqa	xmm10, [rk1]	; rk1 and rk2 in xmm10

	movq	xmm0, arg1	; get the initial crc value
	pslldq	xmm0, 8	; align it to its correct place
	movdqu	xmm7, [arg2]	; load the plaintext
	pshufb	xmm7, xmm11	; byte-reflect the plaintext
	pxor	xmm7, xmm0	; xmm7 = first 16 bytes with the crc mixed in


	; update the buffer pointer
	add	arg2, 16

	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
	; (arg3 = unread bytes - 16, the bias _16B_reduction_loop expects)
	sub	arg3, 32

	jmp	_16B_reduction_loop
; Path for len < 32: dispatches on len == 0, len < 16, len == 16 and 17..31.
align 16
_less_than_32:
	; mov initial crc to the return value. this is necessary for zero-length buffers.
	; (arg1 was complemented in the prologue; _cleanup complements rax again)
	mov	rax, arg1
	test	arg3, arg3
	je	_cleanup

	movdqa xmm11, [SHUF_MASK]	; byte-reversal shuffle control

	movq	xmm0, arg1	; get the initial crc value
	pslldq	xmm0, 8	; align it to its correct place

	cmp	arg3, 16
	je	_exact_16_left
	jl	_less_than_16_left

	; 17..31 bytes: take the first 16 as the accumulator and let
	; _get_last_two_xmms merge the remaining 1..15 bytes
	movdqu	xmm7, [arg2]	; load the plaintext
	pshufb	xmm7, xmm11	; byte-reflect the plaintext
	pxor	xmm7, xmm0	; xor the initial crc value
	add	arg2, 16
	sub	arg3, 16	; arg3 = remaining bytes (1..15)
	movdqa	xmm10, [rk1]	; rk1 and rk2 in xmm10
	jmp	_get_last_two_xmms
; Path for 1..15 input bytes. The bytes are copied piecewise (8/4/2/1) into a
; zeroed 16-byte slot at [rsp] so that no load reads past the end of buf.
; On entry xmm0 = complemented seed in the high qword, xmm11 = byte-reversal control.
align 16
_less_than_16_left:
	; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
	pxor	xmm1, xmm1
	mov	r11, rsp	; r11 = write cursor into the stack slot
	movdqa	[r11], xmm1

	;	backup the counter value
	mov	r9, arg3	; r9 = original length; arg3 is consumed by the copy below
	cmp	arg3, 8
	jl	_less_than_8_left

	; load 8 Bytes
	mov	rax, [arg2]
	mov	[r11], rax
	add	r11, 8
	sub	arg3, 8
	add	arg2, 8
_less_than_8_left:

	cmp	arg3, 4
	jl	_less_than_4_left

	; load 4 Bytes
	mov	eax, [arg2]
	mov	[r11], eax
	add	r11, 4
	sub	arg3, 4
	add	arg2, 4
_less_than_4_left:

	cmp	arg3, 2
	jl	_less_than_2_left

	; load 2 Bytes
	mov	ax, [arg2]
	mov	[r11], ax
	add	r11, 2
	sub	arg3, 2
	add	arg2, 2
_less_than_2_left:
	cmp     arg3, 1
        jl      _zero_left

	; load 1 Byte
	mov	al, [arg2]
	mov	[r11], al
_zero_left:
	; the slot now holds the input followed by zero padding; after the byte
	; reversal the input sits in the top r9 bytes of xmm7
	movdqa	xmm7, [rsp]
	pshufb	xmm7, xmm11
	pxor	xmm7, xmm0	; xor the initial crc value

	; shl r9, 4
	; (the line above is a disabled instruction; r9 is used unscaled as a byte offset)
	lea	rax, [pshufb_shf_table + 16]
	sub	rax, r9		; rax = pshufb_shf_table + 16 - length

	cmp     r9, 8
        jl      _end_1to7

_end_8to15:
	; same control construction as in _get_last_two_xmms: shifts xmm7 right
	; by 16-length bytes, which drops the zero padding
	movdqu	xmm0, [rax]
	pxor	xmm0, [mask1]

	pshufb	xmm7, xmm0
	jmp	_128_done

_end_1to7:
	; Right shift (8-length) bytes in XMM
	; the control comes from 8 bytes further into the table and the result
	; goes directly to the Barrett step, bypassing the 64b fold at _128_done
	add	rax, 8
        movdqu  xmm0, [rax]
        pshufb  xmm7,xmm0

        jmp     _barrett
; Path for len == 16: the whole input is a single block, so it goes straight
; to the final 128-bit reduction.
; On entry xmm0 = complemented seed in the high qword, xmm11 = byte-reversal control.
align 16
_exact_16_left:
	movdqu	xmm7, [arg2]	; load the 16 input bytes
	pshufb	xmm7, xmm11	; byte-reflect them
	pxor	xmm7, xmm0	; xor the initial crc value

	jmp	_128_done

section .data

; precomputed constants
;
; The rk values are stored back to back, so a 16-byte load from an
; odd-numbered label yields that constant in the low qword and the next one
; in the high qword. Usage in the code above:
;   rk1 :rk2   16-byte folds (_16B_reduction_loop, _get_last_two_xmms, xmm6 in the 8-to-1 fold)
;   rk3 :rk4   128-byte fold loop
;   rk5 :rk6   64b fold at _128_done
;   rk7 :rk8   Barrett reduction at _barrett
;   rk9 ..rk20 8-to-1 fold of xmm0..xmm5 (one pair per register)
; The align 16 below keeps every pair, and the masks after them, 16-byte
; aligned as required by the movdqa loads and the pxor/pand memory operands.
align 16

rk1 :
DQ 0x5f5c3c7eb52fab6
rk2 :
DQ 0x4eb938a7d257740e
rk3 :
DQ 0x5cf79dea9ac37d6
rk4 :
DQ 0x001067e571d7d5c2
rk5 :
DQ 0x5f5c3c7eb52fab6
rk6 :
DQ 0x0000000000000000
rk7 :
DQ 0x578d29d06cc4f872
rk8 :
DQ 0x42f0e1eba9ea3693
rk9 :
DQ 0xe464f4df5fb60ac1
rk10 :
DQ 0xb649c5b35a759cf2
rk11 :
DQ 0x9af04e1eff82d0dd
rk12 :
DQ 0x6e82e609297f8fe8
rk13 :
DQ 0x97c516e98bd2e73
rk14 :
DQ 0xb76477b31e22e7b
rk15 :
DQ 0x5f6843ca540df020
rk16 :
DQ 0xddf4b6981205b83f
rk17 :
DQ 0x54819d8713758b2c
rk18 :
DQ 0x4a6b90073eb0af5a
rk19 :
DQ 0x571bee0a227ef92b
rk20 :
DQ 0x44bef2a201b5200c


; mask1: xor-ed into a pshufb control to toggle bit 7 of every control byte
mask1:
dq 0x8080808080808080, 0x8080808080808080
; mask2: not referenced by the code above
mask2:
dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
; mask3: and-mask that keeps the high qword and clears the low qword
mask3:
dq 0x0000000000000000, 0xFFFFFFFFFFFFFFFF

; pshufb control that reverses the order of all 16 bytes
SHUF_MASK:
dq 0x08090A0B0C0D0E0F, 0x0001020304050607

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
; (the code reads 16 bytes at pshufb_shf_table + 16 - n to get the control for
;  a left shift by n bytes; the values below are what those unaligned reads see)
;	dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;	dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;	dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;	dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;	dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;	dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;	dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
;	dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
;	dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
;	dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;	dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;	dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;	dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;	dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;	dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x0f0e0d0c0b0a0908
dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
dq 0x8080808080808080, 0x8080808080808080

;;;       func        core, ver, snum
; NOTE(review): slversion is not defined in this file; presumably a macro from reg_sizes.asm - confirm.
slversion crc64_ecma_norm_by8, 01,   00,  001a
Commit	Line	Data
224ce89b WB	1	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	2	; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
	3	;
	4	; Redistribution and use in source and binary forms, with or without
	5	; modification, are permitted provided that the following conditions
	6	; are met:
	7	; * Redistributions of source code must retain the above copyright
	8	; notice, this list of conditions and the following disclaimer.
	9	; * Redistributions in binary form must reproduce the above copyright
	10	; notice, this list of conditions and the following disclaimer in
	11	; the documentation and/or other materials provided with the
	12	; distribution.
	13	; * Neither the name of Intel Corporation nor the names of its
	14	; contributors may be used to endorse or promote products derived
	15	; from this software without specific prior written permission.
	16	;
	17	; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	18	; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	19	; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
	20	; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
	21	; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	22	; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
	23	; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
	24	; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
	25	; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	26	; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	27	; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	28	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	29
	30	; Function API:
	31	; uint64_t crc64_ecma_norm_by8(
	32	; uint64_t init_crc, //initial CRC value, 64 bits
	33	; const unsigned char *buf, //buffer pointer to calculate CRC on
	34	; uint64_t len //buffer length in bytes (64-bit data)
	35	; );
	36	;
	37	; yasm -f x64 -f elf64 -X gnu -g dwarf2 crc64_ecma_norm_by8
	38	%include "reg_sizes.asm"
	39
	40	%define fetch_dist 1024
	41
	42	[bits 64]
	43	default rel
	44
	45	section .text
	46
	47	%ifidn __OUTPUT_FORMAT__, win64
	48	%xdefine arg1 rcx
	49	%xdefine arg2 rdx
	50	%xdefine arg3 r8
	51	%else
	52	%xdefine arg1 rdi
	53	%xdefine arg2 rsi
	54	%xdefine arg3 rdx
	55	%endif
	56
	57	%define TMP 16*0
	58	%ifidn __OUTPUT_FORMAT__, win64
	59	%define XMM_SAVE 16*2
	60	%define VARIABLE_OFFSET 16*10+8
	61	%else
	62	%define VARIABLE_OFFSET 16*2+8
	63	%endif
	64	align 16
f91f0fd5	65	global crc64_ecma_norm_by8:ISAL_SYM_TYPE_FUNCTION
224ce89b WB	66	crc64_ecma_norm_by8:
	67
	68	not arg1 ;~init_crc
	69
	70	sub rsp,VARIABLE_OFFSET
	71
	72	%ifidn __OUTPUT_FORMAT__, win64
	73	; push the xmm registers into the stack to maintain
	74	movdqa [rsp + XMM_SAVE + 16*0], xmm6
	75	movdqa [rsp + XMM_SAVE + 16*1], xmm7
	76	movdqa [rsp + XMM_SAVE + 16*2], xmm8
	77	movdqa [rsp + XMM_SAVE + 16*3], xmm9
	78	movdqa [rsp + XMM_SAVE + 16*4], xmm10
	79	movdqa [rsp + XMM_SAVE + 16*5], xmm11
	80	movdqa [rsp + XMM_SAVE + 16*6], xmm12
	81	movdqa [rsp + XMM_SAVE + 16*7], xmm13
	82	%endif
	83
	84
	85	; check if smaller than 256
	86	cmp arg3, 256
	87
	88	; for sizes less than 256, we can't fold 128B at a time...
	89	jl _less_than_256
	90
	91
	92	; load the initial crc value
	93	movq xmm10, arg1 ; initial crc
	94
	95	; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
	96	; because data will be byte-reflected and will align with initial crc at correct place.
	97	pslldq xmm10, 8
	98
	99	movdqa xmm11, [SHUF_MASK]
	100	; receive the initial 128B data, xor the initial crc value
	101	movdqu xmm0, [arg2+16*0]
	102	movdqu xmm1, [arg2+16*1]
	103	movdqu xmm2, [arg2+16*2]
	104	movdqu xmm3, [arg2+16*3]
	105	movdqu xmm4, [arg2+16*4]
	106	movdqu xmm5, [arg2+16*5]
	107	movdqu xmm6, [arg2+16*6]
	108	movdqu xmm7, [arg2+16*7]
	109
	110	pshufb xmm0, xmm11
	111	; XOR the initial_crc value
	112	pxor xmm0, xmm10
	113	pshufb xmm1, xmm11
	114	pshufb xmm2, xmm11
	115	pshufb xmm3, xmm11
	116	pshufb xmm4, xmm11
	117	pshufb xmm5, xmm11
	118	pshufb xmm6, xmm11
	119	pshufb xmm7, xmm11
	120
	121	movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
	122	;imm value of pclmulqdq instruction will determine which constant to use
	123	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	124	; we subtract 256 instead of 128 to save one instruction from the loop
	125	sub arg3, 256
	126
	127	; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
	128	; loop will fold 128B at a time until we have 128+y Bytes of buffer
	129
130
131	; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
132	_fold_128_B_loop:
133
134	; update the buffer pointer
135	add arg2, 128 ; buf += 128;
136
137	prefetchnta [arg2+fetch_dist+0]
138	movdqu xmm9, [arg2+16*0]
139	movdqu xmm12, [arg2+16*1]
140	pshufb xmm9, xmm11
141	pshufb xmm12, xmm11
142	movdqa xmm8, xmm0
143	movdqa xmm13, xmm1
144	pclmulqdq xmm0, xmm10, 0x0
145	pclmulqdq xmm8, xmm10 , 0x11
146	pclmulqdq xmm1, xmm10, 0x0
147	pclmulqdq xmm13, xmm10 , 0x11
148	pxor xmm0, xmm9
149	xorps xmm0, xmm8
150	pxor xmm1, xmm12
151	xorps xmm1, xmm13
152
153	prefetchnta [arg2+fetch_dist+32]
154	movdqu xmm9, [arg2+16*2]
155	movdqu xmm12, [arg2+16*3]
156	pshufb xmm9, xmm11
157	pshufb xmm12, xmm11
158	movdqa xmm8, xmm2
159	movdqa xmm13, xmm3
160	pclmulqdq xmm2, xmm10, 0x0
161	pclmulqdq xmm8, xmm10 , 0x11
162	pclmulqdq xmm3, xmm10, 0x0
163	pclmulqdq xmm13, xmm10 , 0x11
164	pxor xmm2, xmm9
165	xorps xmm2, xmm8
166	pxor xmm3, xmm12
167	xorps xmm3, xmm13
168
169	prefetchnta [arg2+fetch_dist+64]
170	movdqu xmm9, [arg2+16*4]
171	movdqu xmm12, [arg2+16*5]
172	pshufb xmm9, xmm11
173	pshufb xmm12, xmm11
174	movdqa xmm8, xmm4
175	movdqa xmm13, xmm5
176	pclmulqdq xmm4, xmm10, 0x0
177	pclmulqdq xmm8, xmm10 , 0x11
178	pclmulqdq xmm5, xmm10, 0x0
179	pclmulqdq xmm13, xmm10 , 0x11
180	pxor xmm4, xmm9
181	xorps xmm4, xmm8
182	pxor xmm5, xmm12
183	xorps xmm5, xmm13
184
185	prefetchnta [arg2+fetch_dist+96]
186	movdqu xmm9, [arg2+16*6]
187	movdqu xmm12, [arg2+16*7]
188	pshufb xmm9, xmm11
189	pshufb xmm12, xmm11
190	movdqa xmm8, xmm6
191	movdqa xmm13, xmm7
192	pclmulqdq xmm6, xmm10, 0x0
193	pclmulqdq xmm8, xmm10 , 0x11
194	pclmulqdq xmm7, xmm10, 0x0
195	pclmulqdq xmm13, xmm10 , 0x11
196	pxor xmm6, xmm9
197	xorps xmm6, xmm8
198	pxor xmm7, xmm12
199	xorps xmm7, xmm13
200
201	sub arg3, 128
202
203	; check if there is another 128B in the buffer to be able to fold
204	jge _fold_128_B_loop
205	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
206
207	add arg2, 128
208	; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
209	; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
210
211
212	; fold the 8 xmm registers to 1 xmm register with different constants
213
214	movdqa xmm10, [rk9]
215	movdqa xmm8, xmm0
216	pclmulqdq xmm0, xmm10, 0x11
217	pclmulqdq xmm8, xmm10, 0x0
218	pxor xmm7, xmm8
219	xorps xmm7, xmm0
220
221	movdqa xmm10, [rk11]
222	movdqa xmm8, xmm1
223	pclmulqdq xmm1, xmm10, 0x11
224	pclmulqdq xmm8, xmm10, 0x0
225	pxor xmm7, xmm8
226	xorps xmm7, xmm1
227
228	movdqa xmm10, [rk13]
229	movdqa xmm8, xmm2
230	pclmulqdq xmm2, xmm10, 0x11
231	pclmulqdq xmm8, xmm10, 0x0
232	pxor xmm7, xmm8
233	pxor xmm7, xmm2
234
235	movdqa xmm10, [rk15]
236	movdqa xmm8, xmm3
237	pclmulqdq xmm3, xmm10, 0x11
238	pclmulqdq xmm8, xmm10, 0x0
239	pxor xmm7, xmm8
240	xorps xmm7, xmm3
241
242	movdqa xmm10, [rk17]
243	movdqa xmm8, xmm4
244	pclmulqdq xmm4, xmm10, 0x11
245	pclmulqdq xmm8, xmm10, 0x0
246	pxor xmm7, xmm8
247	pxor xmm7, xmm4
248
249	movdqa xmm10, [rk19]
250	movdqa xmm8, xmm5
251	pclmulqdq xmm5, xmm10, 0x11
252	pclmulqdq xmm8, xmm10, 0x0
253	pxor xmm7, xmm8
254	xorps xmm7, xmm5
255
256	movdqa xmm10, [rk1] ;xmm10 has rk1 and rk2
257
258	movdqa xmm8, xmm6
259	pclmulqdq xmm6, xmm10, 0x11
260	pclmulqdq xmm8, xmm10, 0x0
261	pxor xmm7, xmm8
262	pxor xmm7, xmm6
263
264
265	; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
266	; instead of a cmp instruction, we use the negative flag with the jl instruction
267	add arg3, 128-16
268	jl _final_reduction_for_128
269
270	; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
271	; we can fold 16 bytes at a time if y>=16
272	; continue folding 16B at a time
273
274	_16B_reduction_loop:
275	movdqa xmm8, xmm7
276	pclmulqdq xmm7, xmm10, 0x11
277	pclmulqdq xmm8, xmm10, 0x0
278	pxor xmm7, xmm8
279	movdqu xmm0, [arg2]
280	pshufb xmm0, xmm11
281	pxor xmm7, xmm0
282	add arg2, 16
283	sub arg3, 16
284	; instead of a cmp instruction, we utilize the flags with the jge instruction
285	; equivalent of: cmp arg3, 16-16
286	; check if there is any more 16B in the buffer to be able to fold
287	jge _16B_reduction_loop
288
289	;now we have 16+z bytes left to reduce, where 0<= z < 16.
290	;first, we reduce the data in the xmm7 register
291
292
293	_final_reduction_for_128:
294	; check if any more data to fold. If not, compute the CRC of the final 128 bits
295	add arg3, 16
296	je _128_done
297
298	; here we are getting data that is less than 16 bytes.
299	; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
300	; after that the registers need to be adjusted.
301	_get_last_two_xmms:
302	movdqa xmm2, xmm7
303
304	movdqu xmm1, [arg2 - 16 + arg3]
305	pshufb xmm1, xmm11
306
307	; get rid of the extra data that was loaded before
308	; load the shift constant
309	lea rax, [pshufb_shf_table + 16]
310	sub rax, arg3
311	movdqu xmm0, [rax]
312
313	; shift xmm2 to the left by arg3 bytes
314	pshufb xmm2, xmm0
315
316	; shift xmm7 to the right by 16-arg3 bytes
317	pxor xmm0, [mask1]
318	pshufb xmm7, xmm0
319	pblendvb xmm1, xmm2 ;xmm0 is implicit
320
321	; fold 16 Bytes
322	movdqa xmm2, xmm1
323	movdqa xmm8, xmm7
324	pclmulqdq xmm7, xmm10, 0x11
325	pclmulqdq xmm8, xmm10, 0x0
326	pxor xmm7, xmm8
327	pxor xmm7, xmm2
328
329	_128_done:
330	; compute crc of a 128-bit value
331	movdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
332	movdqa xmm0, xmm7
333
334	;64b fold
335	pclmulqdq xmm7, xmm10, 0x01 ; H*L
336	pslldq xmm0, 8
337	pxor xmm7, xmm0
338
339	;barrett reduction
340	_barrett:
341	movdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
342	movdqa xmm0, xmm7
343
344	movdqa xmm1, xmm7
345	pand xmm1, [mask3]
346	pclmulqdq xmm7, xmm10, 0x01
347	pxor xmm7, xmm1
348
349	pclmulqdq xmm7, xmm10, 0x11
350	pxor xmm7, xmm0
351	pextrq rax, xmm7, 0
352
353	_cleanup:
354	not rax
355	%ifidn __OUTPUT_FORMAT__, win64
356	movdqa xmm6, [rsp + XMM_SAVE + 16*0]
357	movdqa xmm7, [rsp + XMM_SAVE + 16*1]
358	movdqa xmm8, [rsp + XMM_SAVE + 16*2]
359	movdqa xmm9, [rsp + XMM_SAVE + 16*3]
360	movdqa xmm10, [rsp + XMM_SAVE + 16*4]
361	movdqa xmm11, [rsp + XMM_SAVE + 16*5]
362	movdqa xmm12, [rsp + XMM_SAVE + 16*6]
363	movdqa xmm13, [rsp + XMM_SAVE + 16*7]
364	%endif
365	add rsp, VARIABLE_OFFSET
366	ret
367
368	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
369	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
370	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
371	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
372
373	align 16
374	_less_than_256:
375
376	; check if there is enough buffer to be able to fold 16B at a time
377	cmp arg3, 32
378	jl _less_than_32
379	movdqa xmm11, [SHUF_MASK]
380
381	; if there is, load the constants
382	movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
383
384	movq xmm0, arg1 ; get the initial crc value
385	pslldq xmm0, 8 ; align it to its correct place
386	movdqu xmm7, [arg2] ; load the plaintext
387	pshufb xmm7, xmm11 ; byte-reflect the plaintext
388	pxor xmm7, xmm0
389
390
391	; update the buffer pointer
392	add arg2, 16
393
394	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
395	sub arg3, 32
396
397	jmp _16B_reduction_loop
398	align 16
399	_less_than_32:
400	; mov initial crc to the return value. this is necessary for zero-length buffers.
401	mov rax, arg1
402	test arg3, arg3
403	je _cleanup
404
405	movdqa xmm11, [SHUF_MASK]
406
407	movq xmm0, arg1 ; get the initial crc value
408	pslldq xmm0, 8 ; align it to its correct place
409
410	cmp arg3, 16
411	je _exact_16_left
412	jl _less_than_16_left
413
414	movdqu xmm7, [arg2] ; load the plaintext
415	pshufb xmm7, xmm11 ; byte-reflect the plaintext
416	pxor xmm7, xmm0 ; xor the initial crc value
417	add arg2, 16
418	sub arg3, 16
419	movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
420	jmp _get_last_two_xmms
421	align 16
422	_less_than_16_left:
423	; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
424	pxor xmm1, xmm1
425	mov r11, rsp
426	movdqa [r11], xmm1
427
428	; backup the counter value
429	mov r9, arg3
430	cmp arg3, 8
431	jl _less_than_8_left
432
433	; load 8 Bytes
434	mov rax, [arg2]
435	mov [r11], rax
436	add r11, 8
437	sub arg3, 8
438	add arg2, 8
439	_less_than_8_left:
440
441	cmp arg3, 4
442	jl _less_than_4_left
443
444	; load 4 Bytes
445	mov eax, [arg2]
446	mov [r11], eax
447	add r11, 4
448	sub arg3, 4
449	add arg2, 4
450	_less_than_4_left:
451
452	cmp arg3, 2
453	jl _less_than_2_left
454
455	; load 2 Bytes
456	mov ax, [arg2]
457	mov [r11], ax
458	add r11, 2
459	sub arg3, 2
460	add arg2, 2
461	_less_than_2_left:
462	cmp arg3, 1
463	jl _zero_left
464
465	; load 1 Byte
466	mov al, [arg2]
467	mov [r11], al
468	_zero_left:
469	movdqa xmm7, [rsp]
470	pshufb xmm7, xmm11
471	pxor xmm7, xmm0 ; xor the initial crc value
472
473	; shl r9, 4
474	lea rax, [pshufb_shf_table + 16]
475	sub rax, r9
476
477	cmp r9, 8
478	jl _end_1to7
479
480	_end_8to15:
481	movdqu xmm0, [rax]
482	pxor xmm0, [mask1]
483
484	pshufb xmm7, xmm0
485	jmp _128_done
486
487	_end_1to7:
488	; Right shift (8-length) bytes in XMM
489	add rax, 8
490	movdqu xmm0, [rax]
491	pshufb xmm7,xmm0
492
493	jmp _barrett
494	align 16
495	_exact_16_left:
496	movdqu xmm7, [arg2]
497	pshufb xmm7, xmm11
498	pxor xmm7, xmm0 ; xor the initial crc value
499
500	jmp _128_done
501
502	section .data
503
504	; precomputed constants
505	align 16
506
507	rk1 :
508	DQ 0x5f5c3c7eb52fab6
509	rk2 :
510	DQ 0x4eb938a7d257740e
511	rk3 :
512	DQ 0x5cf79dea9ac37d6
513	rk4 :
514	DQ 0x001067e571d7d5c2
515	rk5 :
516	DQ 0x5f5c3c7eb52fab6
517	rk6 :
518	DQ 0x0000000000000000
519	rk7 :
520	DQ 0x578d29d06cc4f872
521	rk8 :
522	DQ 0x42f0e1eba9ea3693
523	rk9 :
524	DQ 0xe464f4df5fb60ac1
525	rk10 :
526	DQ 0xb649c5b35a759cf2
527	rk11 :
528	DQ 0x9af04e1eff82d0dd
529	rk12 :
530	DQ 0x6e82e609297f8fe8
531	rk13 :
532	DQ 0x97c516e98bd2e73
533	rk14 :
534	DQ 0xb76477b31e22e7b
535	rk15 :
536	DQ 0x5f6843ca540df020
537	rk16 :
538	DQ 0xddf4b6981205b83f
539	rk17 :
540	DQ 0x54819d8713758b2c
541	rk18 :
542	DQ 0x4a6b90073eb0af5a
543	rk19 :
544	DQ 0x571bee0a227ef92b
545	rk20 :
546	DQ 0x44bef2a201b5200c
547
548
549	mask1:
550	dq 0x8080808080808080, 0x8080808080808080
551	mask2:
552	dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
553	mask3:
554	dq 0x0000000000000000, 0xFFFFFFFFFFFFFFFF
555
556	SHUF_MASK:
557	dq 0x08090A0B0C0D0E0F, 0x0001020304050607
558
559	pshufb_shf_table:
560	; use these values for shift constants for the pshufb instruction
561	; different alignments result in values as shown:
562	; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
563	; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2
564	; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3
565	; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
566	; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
567	; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
568	; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
569	; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
570	; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
571	; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
572	; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
573	; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
574	; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
575	; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
576	; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
577	dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
578	dq 0x0706050403020100, 0x0f0e0d0c0b0a0908
579	dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
580	dq 0x8080808080808080, 0x8080808080808080
581
582	;;; func core, ver, snum
583	slversion crc64_ecma_norm_by8, 01, 00, 001a