/*
 * Calculate the checksum of data that is 16 byte aligned and a multiple of
 * 16 bytes.
 *
 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
 * chunks in order to mask the latency of the vpmsum instructions. If we
 * have more than 32 kB of data to checksum we repeat this step multiple
 * times, passing in the previous 1024 bits.
 *
 * The next step is to reduce the 1024 bits to 64 bits. This step adds
 * 32 bits of 0s to the end - this matches what a CRC does. We just
 * calculate constants that land the data in this 32 bits.
 *
 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
 * for n = CRC using POWER8 instructions. We use x = 32.
 *
 * http://en.wikipedia.org/wiki/Barrett_reduction
 *
 * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of either:
 *
 *  a) the GNU General Public License as published by the Free Software
 *     Foundation; either version 2 of the License, or (at your option)
 *     any later version, or
 *  b) the Apache License, Version 2.0
 */
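
/*
 * Rough outline of the final Barrett step (see the labelled instructions
 * after .Lbarrett_reduction below): with the 64-bit remainder a in v0,
 * compute q = floor(a * m / 2^64) with vpmsumd, then the CRC is a xor (q * n),
 * since subtraction is xor in GF(2). The constants m and n are taken from
 * the .barrett_constants table provided by the constants header.
 */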

#if defined (__clang__)
#ifndef __ALTIVEC__
#define __ALTIVEC__
#endif
#include "ppc-asm.h"
#else
#include <ppc-asm.h>
#endif
#include "ppc-opcode.h"

#undef toc

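/*
 * r1 is the stack pointer and r2 the TOC pointer; the fallback defines
 * below are presumably for ppc-asm.h variants that do not name them.
 */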
#ifndef r1
#define r1	1
#endif

#ifndef r2
#define r2	2
#endif

	.section .rodata
	.balign 16

.byteswap_constant:
	/* byte reverse permute constant */
	.octa 0x0F0E0D0C0B0A09080706050403020100

#ifdef CRC32_CONSTANTS_HEADER
#include CRC32_CONSTANTS_HEADER
#else
#include "crc32c_ppc_constants.h"
#endif

	.text

#if defined(__BIG_ENDIAN__) && defined(REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#else
#undef BYTESWAP_DATA
#endif

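/* Byte offsets, kept in non-volatile GPRs, used as index registers for lvx/stvx */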
#define off16	r25
#define off32	r26
#define off48	r27
#define off64	r28
#define off80	r29
#define off96	r30
#define off112	r31

#define const1	v24
#define const2	v25

#define byteswap	v26
#define mask_32bit	v27
#define mask_64bit	v28
#define zeroes		v29

#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif

#ifndef CRC32_FUNCTION_ASM
#define CRC32_FUNCTION_ASM __crc32_vpmsum
#endif

/* unsigned int __crc32_vpmsum(unsigned int crc, void *p, unsigned long len) */
FUNC_START(CRC32_FUNCTION_ASM)
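	/* Save the non-volatile GPRs we use (r25-r31) below the stack pointer */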
	std r31,-8(r1)
	std r30,-16(r1)
	std r29,-24(r1)
	std r28,-32(r1)
	std r27,-40(r1)
	std r26,-48(r1)
	std r25,-56(r1)

	li off16,16
	li off32,32
	li off48,48
	li off64,64
	li off80,80
	li off96,96
	li off112,112
	li r0,0
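	/* r0 flags whether v16-v23 already hold data from a previous pass of the outer loop */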

	/* Enough room for saving 10 non volatile VMX registers */
	subi r6,r1,56+10*16
	subi r7,r1,56+2*16

	stvx v20,0,r6
	stvx v21,off16,r6
	stvx v22,off32,r6
	stvx v23,off48,r6
	stvx v24,off64,r6
	stvx v25,off80,r6
	stvx v26,off96,r6
	stvx v27,off112,r6
	stvx v28,0,r7
	stvx v29,off16,r7

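	/* Keep the original crc argument in r10 so the zero length path can return it unchanged */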
	mr r10,r3

	vxor zeroes,zeroes,zeroes
	vspltisw v0,-1

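	/* Build masks that select the bottom 32 and 64 bits of a vector */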
	vsldoi mask_32bit,zeroes,v0,4
	vsldoi mask_64bit,zeroes,v0,8

	/* Get the initial value into v8 */
	vxor v8,v8,v8
	MTVRD(v8, r3)
#ifdef REFLECT
	vsldoi v8,zeroes,v8,8	/* shift into bottom 32 bits */
#else
	vsldoi v8,v8,zeroes,4	/* shift into top 32 bits */
#endif

#ifdef BYTESWAP_DATA
	addis r3,r2,.byteswap_constant@toc@ha
	addi r3,r3,.byteswap_constant@toc@l

	lvx byteswap,0,r3
	addi r3,r3,16
#endif

	cmpdi r5,256
	blt .Lshort

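	/* Round the length down to a multiple of 128 bytes; the remainder is handled after the main loop */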
	rldicr r6,r5,0,56

	/* Checksum in blocks of MAX_SIZE */
1:	lis r7,MAX_SIZE@h
	ori r7,r7,MAX_SIZE@l
	mr r9,r7
	cmpd r6,r7
	bgt 2f
	mr r7,r6
2:	subf r6,r7,r6

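	/* r7 now holds the bytes to checksum this pass (at most MAX_SIZE), r6 the bytes left after it */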
	/* our main loop does 128 bytes at a time */
	srdi r7,r7,7

	/*
	 * Work out the offset into the constants table to start at. Each
	 * constant is 16 bytes, and it is used against 128 bytes of input
	 * data - 128 / 16 = 8
	 */
	sldi r8,r7,4
	srdi r9,r9,3
	subf r8,r8,r9

	/* We reduce our final 128 bytes in a separate step */
	addi r7,r7,-1
	mtctr r7

	addis r3,r2,.constants@toc@ha
	addi r3,r3,.constants@toc@l

	/* Find the start of our constants */
	add r3,r3,r8

	/* zero v0-v7 which will contain our checksums */
	vxor v0,v0,v0
	vxor v1,v1,v1
	vxor v2,v2,v2
	vxor v3,v3,v3
	vxor v4,v4,v4
	vxor v5,v5,v5
	vxor v6,v6,v6
	vxor v7,v7,v7

	lvx const1,0,r3

	/*
	 * If we are looping back to consume more data we use the values
	 * already in v16-v23.
	 */
	cmpdi r0,1
	beq 2f

	/* First warm up pass */
	lvx v16,0,r4
	lvx v17,off16,r4
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)
	lvx v18,off32,r4
	lvx v19,off48,r4
	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)
	lvx v20,off64,r4
	lvx v21,off80,r4
	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)
	lvx v22,off96,r4
	lvx v23,off112,r4
	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)
	addi r4,r4,8*16

	/* xor in initial value */
	vxor v16,v16,v8

2:	bdz .Lfirst_warm_up_done

	addi r3,r3,16
	lvx const2,0,r3

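	/* The ori r2,r2,0 instructions below are no-ops, apparently there to pad dispatch groups */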
	/* Second warm up pass */
	VPMSUMD(v8,v16,const1)
	lvx v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori r2,r2,0

	VPMSUMD(v9,v17,const1)
	lvx v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori r2,r2,0

	VPMSUMD(v10,v18,const1)
	lvx v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori r2,r2,0

	VPMSUMD(v11,v19,const1)
	lvx v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	ori r2,r2,0

	VPMSUMD(v12,v20,const1)
	lvx v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori r2,r2,0

	VPMSUMD(v13,v21,const1)
	lvx v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori r2,r2,0

	VPMSUMD(v14,v22,const1)
	lvx v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori r2,r2,0

	VPMSUMD(v15,v23,const1)
	lvx v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi r4,r4,8*16

	bdz .Lfirst_cool_down

	/*
	 * main loop. We modulo schedule it such that it takes three iterations
	 * to complete - first iteration load, second iteration vpmsum, third
	 * iteration xor.
	 */
	.balign 16
4:	lvx const1,0,r3
	addi r3,r3,16
	ori r2,r2,0

	vxor v0,v0,v8
	VPMSUMD(v8,v16,const2)
	lvx v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori r2,r2,0

	vxor v1,v1,v9
	VPMSUMD(v9,v17,const2)
	lvx v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori r2,r2,0

	vxor v2,v2,v10
	VPMSUMD(v10,v18,const2)
	lvx v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori r2,r2,0

	vxor v3,v3,v11
	VPMSUMD(v11,v19,const2)
	lvx v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	lvx const2,0,r3
	ori r2,r2,0

	vxor v4,v4,v12
	VPMSUMD(v12,v20,const1)
	lvx v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori r2,r2,0

	vxor v5,v5,v13
	VPMSUMD(v13,v21,const1)
	lvx v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori r2,r2,0

	vxor v6,v6,v14
	VPMSUMD(v14,v22,const1)
	lvx v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori r2,r2,0

	vxor v7,v7,v15
	VPMSUMD(v15,v23,const1)
	lvx v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi r4,r4,8*16

	bdnz 4b

.Lfirst_cool_down:
	/* First cool down pass */
	lvx const1,0,r3
	addi r3,r3,16

	vxor v0,v0,v8
	VPMSUMD(v8,v16,const1)
	ori r2,r2,0

	vxor v1,v1,v9
	VPMSUMD(v9,v17,const1)
	ori r2,r2,0

	vxor v2,v2,v10
	VPMSUMD(v10,v18,const1)
	ori r2,r2,0

	vxor v3,v3,v11
	VPMSUMD(v11,v19,const1)
	ori r2,r2,0

	vxor v4,v4,v12
	VPMSUMD(v12,v20,const1)
	ori r2,r2,0

	vxor v5,v5,v13
	VPMSUMD(v13,v21,const1)
	ori r2,r2,0

	vxor v6,v6,v14
	VPMSUMD(v14,v22,const1)
	ori r2,r2,0

	vxor v7,v7,v15
	VPMSUMD(v15,v23,const1)
	ori r2,r2,0

.Lsecond_cool_down:
	/* Second cool down pass */
	vxor v0,v0,v8
	vxor v1,v1,v9
	vxor v2,v2,v10
	vxor v3,v3,v11
	vxor v4,v4,v12
	vxor v5,v5,v13
	vxor v6,v6,v14
	vxor v7,v7,v15

#ifdef REFLECT
	/*
	 * vpmsumd produces a 96 bit result in the least significant bits
	 * of the register. Since we are bit reflected we have to shift it
	 * left 32 bits so it occupies the least significant bits in the
	 * bit reflected domain.
	 */
	vsldoi v0,v0,zeroes,4
	vsldoi v1,v1,zeroes,4
	vsldoi v2,v2,zeroes,4
	vsldoi v3,v3,zeroes,4
	vsldoi v4,v4,zeroes,4
	vsldoi v5,v5,zeroes,4
	vsldoi v6,v6,zeroes,4
	vsldoi v7,v7,zeroes,4
#endif

	/* xor with last 1024 bits */
	lvx v8,0,r4
	lvx v9,off16,r4
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)
	lvx v10,off32,r4
	lvx v11,off48,r4
	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)
	lvx v12,off64,r4
	lvx v13,off80,r4
	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)
	lvx v14,off96,r4
	lvx v15,off112,r4
	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)

	addi r4,r4,8*16

	vxor v16,v0,v8
	vxor v17,v1,v9
	vxor v18,v2,v10
	vxor v19,v3,v11
	vxor v20,v4,v12
	vxor v21,v5,v13
	vxor v22,v6,v14
	vxor v23,v7,v15

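	/* If there is more data, loop back with the partial checksums already in v16-v23 (r0=1 signals this) */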
	li r0,1
	cmpdi r6,0
	addi r6,r6,128
	bne 1b

	/* Work out how many bytes we have left */
	andi. r5,r5,127

	/* Calculate where in the constant table we need to start */
	subfic r6,r5,128
	add r3,r3,r6

	/* How many 16 byte chunks are in the tail */
	srdi r7,r5,4
	mtctr r7

	/*
	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
	 * 32 bits to include the trailing 32 bits of zeros
	 */
	lvx v0,0,r3
	lvx v1,off16,r3
	lvx v2,off32,r3
	lvx v3,off48,r3
	lvx v4,off64,r3
	lvx v5,off80,r3
	lvx v6,off96,r3
	lvx v7,off112,r3
	addi r3,r3,8*16

	VPMSUMW(v0,v16,v0)
	VPMSUMW(v1,v17,v1)
	VPMSUMW(v2,v18,v2)
	VPMSUMW(v3,v19,v3)
	VPMSUMW(v4,v20,v4)
	VPMSUMW(v5,v21,v5)
	VPMSUMW(v6,v22,v6)
	VPMSUMW(v7,v23,v7)

	/* Now reduce the tail (0 - 112 bytes) */
	cmpdi r7,0
	beq 1f

	lvx v16,0,r4
	lvx v17,0,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off16,r4
	lvx v17,off16,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off32,r4
	lvx v17,off32,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off48,r4
	lvx v17,off48,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off64,r4
	lvx v17,off64,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off80,r4
	lvx v17,off80,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off96,r4
	lvx v17,off96,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16

	/* Now xor all the parallel chunks together */
1:	vxor v0,v0,v1
	vxor v2,v2,v3
	vxor v4,v4,v5
	vxor v6,v6,v7

	vxor v0,v0,v2
	vxor v4,v4,v6

	vxor v0,v0,v4

.Lbarrett_reduction:
	/* Barrett constants */
	addis r3,r2,.barrett_constants@toc@ha
	addi r3,r3,.barrett_constants@toc@l

	lvx const1,0,r3
	lvx const2,off16,r3

	vsldoi v1,v0,v0,8
	vxor v0,v0,v1		/* xor two 64 bit results together */

#ifdef REFLECT
	/* shift left one bit */
	vspltisb v1,1
	vsl v0,v0,v1
#endif

	vand v0,v0,mask_64bit

#ifndef REFLECT
	/*
	 * Now for the Barrett reduction algorithm. The idea is to calculate q,
	 * the multiple of our polynomial that we need to subtract. By
	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
	 * result back down 2x bits, we round down to the nearest multiple.
	 */
	VPMSUMD(v1,v0,const1)	/* ma */
	vsldoi v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Get the result into r3. We need to shift it left 8 bytes:
	 * V0 [ 0 1 2 X ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi v0,v0,zeroes,8	/* shift result into top 64 bits */
#else
	/*
	 * The reflected version of Barrett reduction. Instead of bit
	 * reflecting our data (which is expensive to do), we bit reflect our
	 * constants and our algorithm, which means the intermediate data in
	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
	 * the algorithm because we don't carry in mod 2 arithmetic.
	 */
	vand v1,v0,mask_32bit	/* bottom 32 bits of a */
	VPMSUMD(v1,v1,const1)	/* ma */
	vand v1,v1,mask_32bit	/* bottom 32 bits of ma */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Since we are bit reflected, the result (ie the low 32 bits) is in
	 * the high 32 bits. We just need to shift it left 4 bytes
	 * V0 [ 0 1 X 3 ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi v0,v0,zeroes,4	/* shift result into top 64 bits of v0 */
#endif

	/* Get it into r3 */
	MFVRD(r3, v0)

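/* Restore the non-volatile VMX and GPR registers and return with the CRC in r3 */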
.Lout:
	subi r6,r1,56+10*16
	subi r7,r1,56+2*16

	lvx v20,0,r6
	lvx v21,off16,r6
	lvx v22,off32,r6
	lvx v23,off48,r6
	lvx v24,off64,r6
	lvx v25,off80,r6
	lvx v26,off96,r6
	lvx v27,off112,r6
	lvx v28,0,r7
	lvx v29,off16,r7

	ld r31,-8(r1)
	ld r30,-16(r1)
	ld r29,-24(r1)
	ld r28,-32(r1)
	ld r27,-40(r1)
	ld r26,-48(r1)
	ld r25,-56(r1)

	blr

.Lfirst_warm_up_done:
	lvx const1,0,r3
	addi r3,r3,16

	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	b .Lsecond_cool_down

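/*
 * Short path for lengths below 256 bytes: each 16 byte chunk is multiplied
 * by its own constant from .short_constants and the results are accumulated
 * in v19 and v20.
 */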
.Lshort:
	cmpdi r5,0
	beq .Lzero

	addis r3,r2,.short_constants@toc@ha
	addi r3,r3,.short_constants@toc@l

	/* Calculate where in the constant table we need to start */
	subfic r6,r5,256
	add r3,r3,r6

	/* How many 16 byte chunks? */
	srdi r7,r5,4
	mtctr r7

	vxor v19,v19,v19
	vxor v20,v20,v20

	lvx v0,0,r4
	lvx v16,0,r3
	VPERM(v0,v0,v16,byteswap)
	vxor v0,v0,v8	/* xor in initial value */
	VPMSUMW(v0,v0,v16)
	bdz .Lv0

	lvx v1,off16,r4
	lvx v17,off16,r3
	VPERM(v1,v1,v17,byteswap)
	VPMSUMW(v1,v1,v17)
	bdz .Lv1

	lvx v2,off32,r4
	lvx v16,off32,r3
	VPERM(v2,v2,v16,byteswap)
	VPMSUMW(v2,v2,v16)
	bdz .Lv2

	lvx v3,off48,r4
	lvx v17,off48,r3
	VPERM(v3,v3,v17,byteswap)
	VPMSUMW(v3,v3,v17)
	bdz .Lv3

	lvx v4,off64,r4
	lvx v16,off64,r3
	VPERM(v4,v4,v16,byteswap)
	VPMSUMW(v4,v4,v16)
	bdz .Lv4

	lvx v5,off80,r4
	lvx v17,off80,r3
	VPERM(v5,v5,v17,byteswap)
	VPMSUMW(v5,v5,v17)
	bdz .Lv5

	lvx v6,off96,r4
	lvx v16,off96,r3
	VPERM(v6,v6,v16,byteswap)
	VPMSUMW(v6,v6,v16)
	bdz .Lv6

	lvx v7,off112,r4
	lvx v17,off112,r3
	VPERM(v7,v7,v17,byteswap)
	VPMSUMW(v7,v7,v17)
	bdz .Lv7

	addi r3,r3,128
	addi r4,r4,128

	lvx v8,0,r4
	lvx v16,0,r3
	VPERM(v8,v8,v16,byteswap)
	VPMSUMW(v8,v8,v16)
	bdz .Lv8

	lvx v9,off16,r4
	lvx v17,off16,r3
	VPERM(v9,v9,v17,byteswap)
	VPMSUMW(v9,v9,v17)
	bdz .Lv9

	lvx v10,off32,r4
	lvx v16,off32,r3
	VPERM(v10,v10,v16,byteswap)
	VPMSUMW(v10,v10,v16)
	bdz .Lv10

	lvx v11,off48,r4
	lvx v17,off48,r3
	VPERM(v11,v11,v17,byteswap)
	VPMSUMW(v11,v11,v17)
	bdz .Lv11

	lvx v12,off64,r4
	lvx v16,off64,r3
	VPERM(v12,v12,v16,byteswap)
	VPMSUMW(v12,v12,v16)
	bdz .Lv12

	lvx v13,off80,r4
	lvx v17,off80,r3
	VPERM(v13,v13,v17,byteswap)
	VPMSUMW(v13,v13,v17)
	bdz .Lv13

	lvx v14,off96,r4
	lvx v16,off96,r3
	VPERM(v14,v14,v16,byteswap)
	VPMSUMW(v14,v14,v16)
	bdz .Lv14

	lvx v15,off112,r4
	lvx v17,off112,r3
	VPERM(v15,v15,v17,byteswap)
	VPMSUMW(v15,v15,v17)

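/* Entering the ladder at .LvN via the bdz branches above accumulates only the chunks that were processed */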
.Lv15:	vxor v19,v19,v15
.Lv14:	vxor v20,v20,v14
.Lv13:	vxor v19,v19,v13
.Lv12:	vxor v20,v20,v12
.Lv11:	vxor v19,v19,v11
.Lv10:	vxor v20,v20,v10
.Lv9:	vxor v19,v19,v9
.Lv8:	vxor v20,v20,v8
.Lv7:	vxor v19,v19,v7
.Lv6:	vxor v20,v20,v6
.Lv5:	vxor v19,v19,v5
.Lv4:	vxor v20,v20,v4
.Lv3:	vxor v19,v19,v3
.Lv2:	vxor v20,v20,v2
.Lv1:	vxor v19,v19,v1
.Lv0:	vxor v20,v20,v0

	vxor v0,v19,v20

	b .Lbarrett_reduction

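/* Zero length input: return the original crc unchanged */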
.Lzero:
	mr r3,r10
	b .Lout

FUNC_END(CRC32_FUNCTION_ASM)