/*
 * Calculate the checksum of data that is 16 byte aligned and a multiple of
 * 16 bytes.
 *
 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
 * chunks in order to mask the latency of the vpmsum instructions. If we
 * have more than 32 kB of data to checksum we repeat this step multiple
 * times, passing in the previous 1024 bits.
 *
 * The next step is to reduce the 1024 bits to 64 bits. This step adds
 * 32 bits of 0s to the end - this matches what a CRC does. We just
 * calculate constants that land the data in these 32 bits.
 *
 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
 * for n = CRC using POWER8 instructions. We use x = 32.
 *
 * http://en.wikipedia.org/wiki/Barrett_reduction
 *
 * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of either:
 *
 * a) the GNU General Public License as published by the Free Software
 *    Foundation; either version 2 of the License, or (at your option)
 *    any later version, or
 * b) the Apache License, Version 2.0
 */
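
/*
 * A rough sketch of the folding idea used below (illustrative pseudo-code
 * only; clmul() stands for vpmsumd, a carry-less multiply, and P for the
 * CRC polynomial): every 16 byte chunk of input is multiplied by a
 * precomputed power of x modulo P that accounts for how far that chunk
 * sits from the end of the buffer, and all of the products are xored
 * together:
 *
 *     state ^= clmul(chunk, x^K mod P)      for each chunk
 *
 * Because CRC is linear over GF(2) (addition is xor), the chunks can be
 * processed in eight independent vector lanes and only combined in the
 * final reduction, which is what hides the vpmsum latency.
 */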

#if defined (__clang__)
#ifndef __ALTIVEC__
#define __ALTIVEC__
#endif
#include "ppc-asm.h"
#else
#include <ppc-asm.h>
#endif
#include "ppc-opcode.h"

#undef toc

#ifndef r1
#define r1 1
#endif

#ifndef r2
#define r2 2
#endif

	.section .rodata
.balign 16

.byteswap_constant:
	/* byte reverse permute constant */
	.octa 0x0F0E0D0C0B0A09080706050403020100

#ifdef CRC32_CONSTANTS_HEADER
#include CRC32_CONSTANTS_HEADER
#else
#include "crc32c_ppc_constants.h"
#endif

	.text

#if defined(__BIG_ENDIAN__) && defined(REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#else
#undef BYTESWAP_DATA
#endif
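
/*
 * Note on the endian logic above: lvx fetches data in host byte order, and
 * BYTESWAP_DATA is set whenever that order does not match the ordering the
 * constants appear to be generated for (reflected CRCs on big endian,
 * non-reflected CRCs on little endian); in that case every load is followed
 * by a vperm through .byteswap_constant.
 */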

#define off16 r25
#define off32 r26
#define off48 r27
#define off64 r28
#define off80 r29
#define off96 r30
#define off112 r31

#define const1 v24
#define const2 v25

#define byteswap v26
#define mask_32bit v27
#define mask_64bit v28
#define zeroes v29

#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif

#ifndef CRC32_FUNCTION_ASM
#define CRC32_FUNCTION_ASM __crc32_vpmsum
#endif

/* unsigned int __crc32_vpmsum(unsigned int crc, void *p, unsigned long len) */
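/*
 * Per the 64-bit ELF ABI the arguments arrive as r3 = initial crc,
 * r4 = data pointer (16 byte aligned per the comment at the top of this
 * file), r5 = length in bytes (a multiple of 16); the result is returned
 * in r3.  Any unaligned head or tail smaller than 16 bytes is presumably
 * dealt with by the C caller.
 */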
FUNC_START(CRC32_FUNCTION_ASM)
	std r31,-8(r1)
	std r30,-16(r1)
	std r29,-24(r1)
	std r28,-32(r1)
	std r27,-40(r1)
	std r26,-48(r1)
	std r25,-56(r1)

	li off16,16
	li off32,32
	li off48,48
	li off64,64
	li off80,80
	li off96,96
	li off112,112
	li r0,0

	/* Enough room for saving 10 non volatile VMX registers */
	subi r6,r1,56+10*16
	subi r7,r1,56+2*16
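	/*
	 * Save area sketch: the seven GPR slots above occupy r1-8 .. r1-56,
	 * and r6 = r1-216 is the base of the 160 byte block holding
	 * v20-v29 (v28/v29 are stored from r7 = r1-88, which is r6+128).
	 * As a leaf function this should stay within the protected area
	 * the 64-bit ELF ABI leaves below the stack pointer.
	 */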

	stvx v20,0,r6
	stvx v21,off16,r6
	stvx v22,off32,r6
	stvx v23,off48,r6
	stvx v24,off64,r6
	stvx v25,off80,r6
	stvx v26,off96,r6
	stvx v27,off112,r6
	stvx v28,0,r7
	stvx v29,off16,r7

	mr r10,r3

	vxor zeroes,zeroes,zeroes
	vspltisw v0,-1

	vsldoi mask_32bit,zeroes,v0,4
	vsldoi mask_64bit,zeroes,v0,8

	/* Get the initial value into v8 */
	vxor v8,v8,v8
	MTVRD(v8, r3)
#ifdef REFLECT
	vsldoi v8,zeroes,v8,8	/* shift into bottom 32 bits */
#else
	vsldoi v8,v8,zeroes,4	/* shift into top 32 bits */
#endif

#ifdef BYTESWAP_DATA
	addis r3,r2,.byteswap_constant@toc@ha
	addi r3,r3,.byteswap_constant@toc@l

	lvx byteswap,0,r3
	addi r3,r3,16
#endif

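	/*
	 * Buffers shorter than 256 bytes are not worth the eight lane
	 * pipeline below; they take the .Lshort path, which works through
	 * the data 16 bytes at a time against the .short_constants table.
	 */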
	cmpdi r5,256
	blt .Lshort

	rldicr r6,r5,0,56

	/* Checksum in blocks of MAX_SIZE */
1:	lis r7,MAX_SIZE@h
	ori r7,r7,MAX_SIZE@l
	mr r9,r7
	cmpd r6,r7
	bgt 2f
	mr r7,r6
2:	subf r6,r7,r6

	/* our main loop does 128 bytes at a time */
	srdi r7,r7,7

	/*
	 * Work out the offset into the constants table to start at. Each
	 * constant is 16 bytes, and it is used against 128 bytes of input
	 * data - 128 / 16 = 8
	 */
	sldi r8,r7,4
	srdi r9,r9,3
	subf r8,r8,r9
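	/*
	 * Worked example of the indexing above, assuming MAX_SIZE is 32 kB
	 * (32768): r9 = 32768/8 = 4096 bytes, one 16 byte constant per
	 * 128 byte block of a maximum sized pass, and r8 = (blocks this
	 * pass) * 16 is how many bytes of constants this pass consumes.
	 * Starting at offset r9 - r8 means a shorter final pass still
	 * finishes on the last constant in the table.
	 */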

	/* We reduce our final 128 bytes in a separate step */
	addi r7,r7,-1
	mtctr r7

	addis r3,r2,.constants@toc@ha
	addi r3,r3,.constants@toc@l

	/* Find the start of our constants */
	add r3,r3,r8

	/* zero v0-v7 which will contain our checksums */
	vxor v0,v0,v0
	vxor v1,v1,v1
	vxor v2,v2,v2
	vxor v3,v3,v3
	vxor v4,v4,v4
	vxor v5,v5,v5
	vxor v6,v6,v6
	vxor v7,v7,v7

	lvx const1,0,r3

	/*
	 * If we are looping back to consume more data we use the values
	 * already in v16-v23.
	 */
	cmpdi r0,1
	beq 2f

	/* First warm up pass */
	lvx v16,0,r4
	lvx v17,off16,r4
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)
	lvx v18,off32,r4
	lvx v19,off48,r4
	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)
	lvx v20,off64,r4
	lvx v21,off80,r4
	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)
	lvx v22,off96,r4
	lvx v23,off112,r4
	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)
	addi r4,r4,8*16

	/* xor in initial value */
	vxor v16,v16,v8

2:	bdz .Lfirst_warm_up_done

	addi r3,r3,16
	lvx const2,0,r3

	/* Second warm up pass */
	VPMSUMD(v8,v16,const1)
	lvx v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori r2,r2,0

	VPMSUMD(v9,v17,const1)
	lvx v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori r2,r2,0

	VPMSUMD(v10,v18,const1)
	lvx v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori r2,r2,0

	VPMSUMD(v11,v19,const1)
	lvx v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	ori r2,r2,0

	VPMSUMD(v12,v20,const1)
	lvx v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori r2,r2,0

	VPMSUMD(v13,v21,const1)
	lvx v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori r2,r2,0

	VPMSUMD(v14,v22,const1)
	lvx v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori r2,r2,0

	VPMSUMD(v15,v23,const1)
	lvx v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi r4,r4,8*16

	bdz .Lfirst_cool_down

	/*
	 * main loop. We modulo schedule it such that it takes three iterations
	 * to complete - first iteration load, second iteration vpmsum, third
	 * iteration xor.
	 */
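	/*
	 * Roughly, each lane i of one loop iteration below does, in
	 * pseudo-code (prod_i is the vpmsumd result started last iteration,
	 * data_i the block loaded the iteration before that):
	 *
	 *     acc_i  ^= prod_i
	 *     prod_i  = clmul(data_i, constant for this block)
	 *     data_i  = load(next block, lane i)
	 *
	 * The interspersed "ori r2,r2,0" instructions do not change r2; they
	 * appear to be there purely as scheduling/dispatch padding.
	 */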
	.balign 16
4:	lvx const1,0,r3
	addi r3,r3,16
	ori r2,r2,0

	vxor v0,v0,v8
	VPMSUMD(v8,v16,const2)
	lvx v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori r2,r2,0

	vxor v1,v1,v9
	VPMSUMD(v9,v17,const2)
	lvx v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori r2,r2,0

	vxor v2,v2,v10
	VPMSUMD(v10,v18,const2)
	lvx v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori r2,r2,0

	vxor v3,v3,v11
	VPMSUMD(v11,v19,const2)
	lvx v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	lvx const2,0,r3
	ori r2,r2,0

	vxor v4,v4,v12
	VPMSUMD(v12,v20,const1)
	lvx v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori r2,r2,0

	vxor v5,v5,v13
	VPMSUMD(v13,v21,const1)
	lvx v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori r2,r2,0

	vxor v6,v6,v14
	VPMSUMD(v14,v22,const1)
	lvx v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori r2,r2,0

	vxor v7,v7,v15
	VPMSUMD(v15,v23,const1)
	lvx v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi r4,r4,8*16

	bdnz 4b

.Lfirst_cool_down:
	/* First cool down pass */
	lvx const1,0,r3
	addi r3,r3,16

	vxor v0,v0,v8
	VPMSUMD(v8,v16,const1)
	ori r2,r2,0

	vxor v1,v1,v9
	VPMSUMD(v9,v17,const1)
	ori r2,r2,0

	vxor v2,v2,v10
	VPMSUMD(v10,v18,const1)
	ori r2,r2,0

	vxor v3,v3,v11
	VPMSUMD(v11,v19,const1)
	ori r2,r2,0

	vxor v4,v4,v12
	VPMSUMD(v12,v20,const1)
	ori r2,r2,0

	vxor v5,v5,v13
	VPMSUMD(v13,v21,const1)
	ori r2,r2,0

	vxor v6,v6,v14
	VPMSUMD(v14,v22,const1)
	ori r2,r2,0

	vxor v7,v7,v15
	VPMSUMD(v15,v23,const1)
	ori r2,r2,0

.Lsecond_cool_down:
	/* Second cool down pass */
	vxor v0,v0,v8
	vxor v1,v1,v9
	vxor v2,v2,v10
	vxor v3,v3,v11
	vxor v4,v4,v12
	vxor v5,v5,v13
	vxor v6,v6,v14
	vxor v7,v7,v15

#ifdef REFLECT
	/*
	 * vpmsumd produces a 96 bit result in the least significant bits
	 * of the register. Since we are bit reflected we have to shift it
	 * left 32 bits so it occupies the least significant bits in the
	 * bit reflected domain.
	 */
	vsldoi v0,v0,zeroes,4
	vsldoi v1,v1,zeroes,4
	vsldoi v2,v2,zeroes,4
	vsldoi v3,v3,zeroes,4
	vsldoi v4,v4,zeroes,4
	vsldoi v5,v5,zeroes,4
	vsldoi v6,v6,zeroes,4
	vsldoi v7,v7,zeroes,4
#endif

	/* xor with last 1024 bits */
	lvx v8,0,r4
	lvx v9,off16,r4
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)
	lvx v10,off32,r4
	lvx v11,off48,r4
	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)
	lvx v12,off64,r4
	lvx v13,off80,r4
	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)
	lvx v14,off96,r4
	lvx v15,off112,r4
	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)

	addi r4,r4,8*16

	vxor v16,v0,v8
	vxor v17,v1,v9
	vxor v18,v2,v10
	vxor v19,v3,v11
	vxor v20,v4,v12
	vxor v21,v5,v13
	vxor v22,v6,v14
	vxor v23,v7,v15

	li r0,1
	cmpdi r6,0
	addi r6,r6,128
	bne 1b
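	/*
	 * r0 = 1 records that v16-v23 already hold live data (the block
	 * xored in just above), so the next pass skips its first warm up
	 * load; the 128 added back to r6 appears to re-count that carried
	 * block, which the next pass accounts for but does not reload.
	 */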

	/* Work out how many bytes we have left */
	andi. r5,r5,127

	/* Calculate where in the constant table we need to start */
	subfic r6,r5,128
	add r3,r3,r6

	/* How many 16 byte chunks are in the tail */
	srdi r7,r5,4
	mtctr r7

	/*
	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
	 * 32 bits to include the trailing 32 bits of zeros
	 */
	lvx v0,0,r3
	lvx v1,off16,r3
	lvx v2,off32,r3
	lvx v3,off48,r3
	lvx v4,off64,r3
	lvx v5,off80,r3
	lvx v6,off96,r3
	lvx v7,off112,r3
	addi r3,r3,8*16

	VPMSUMW(v0,v16,v0)
	VPMSUMW(v1,v17,v1)
	VPMSUMW(v2,v18,v2)
	VPMSUMW(v3,v19,v3)
	VPMSUMW(v4,v20,v4)
	VPMSUMW(v5,v21,v5)
	VPMSUMW(v6,v22,v6)
	VPMSUMW(v7,v23,v7)

	/* Now reduce the tail (0 - 112 bytes) */
	cmpdi r7,0
	beq 1f

	lvx v16,0,r4
	lvx v17,0,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off16,r4
	lvx v17,off16,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off32,r4
	lvx v17,off32,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off48,r4
	lvx v17,off48,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off64,r4
	lvx v17,off64,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off80,r4
	lvx v17,off80,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off96,r4
	lvx v17,off96,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16

	/* Now xor all the parallel chunks together */
1:	vxor v0,v0,v1
	vxor v2,v2,v3
	vxor v4,v4,v5
	vxor v6,v6,v7

	vxor v0,v0,v2
	vxor v4,v4,v6

	vxor v0,v0,v4

.Lbarrett_reduction:
	/* Barrett constants */
	addis r3,r2,.barrett_constants@toc@ha
	addi r3,r3,.barrett_constants@toc@l

	lvx const1,0,r3
	lvx const2,off16,r3

	vsldoi v1,v0,v0,8
	vxor v0,v0,v1	/* xor two 64 bit results together */

#ifdef REFLECT
	/* shift left one bit */
	vspltisb v1,1
	vsl v0,v0,v1
#endif

	vand v0,v0,mask_64bit

#ifndef REFLECT
	/*
	 * Now for the Barrett reduction algorithm. The idea is to calculate q,
	 * the multiple of our polynomial that we need to subtract. By
	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
	 * result back down 2x bits, we round down to the nearest multiple.
	 */
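	/*
	 * In formula form, with a the 64 bit value in v0, n the CRC
	 * polynomial and m = floor(x^64 / n) (polynomial division over
	 * GF(2); presumably the two constants loaded into const2 and
	 * const1 above):
	 *
	 *     q   = floor((a * m) / x^64)     (the high 64 bits of m*a)
	 *     crc = a - q * n                 (subtraction is xor in GF(2))
	 */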
	VPMSUMD(v1,v0,const1)	/* ma */
	vsldoi v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Get the result into r3. We need to shift it left 8 bytes:
	 * V0 [ 0 1 2 X ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi v0,v0,zeroes,8	/* shift result into top 64 bits */
#else
	/*
	 * The reflected version of Barrett reduction. Instead of bit
	 * reflecting our data (which is expensive to do), we bit reflect our
	 * constants and our algorithm, which means the intermediate data in
	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
	 * the algorithm because we don't carry in mod 2 arithmetic.
	 */
	vand v1,v0,mask_32bit	/* bottom 32 bits of a */
	VPMSUMD(v1,v1,const1)	/* ma */
	vand v1,v1,mask_32bit	/* bottom 32 bits of ma */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Since we are bit reflected, the result (ie the low 32 bits) is in
	 * the high 32 bits. We just need to shift it left 4 bytes
	 * V0 [ 0 1 X 3 ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi v0,v0,zeroes,4	/* shift result into top 64 bits */
#endif

	/* Get it into r3 */
	MFVRD(r3, v0)

.Lout:
	subi r6,r1,56+10*16
	subi r7,r1,56+2*16

	lvx v20,0,r6
	lvx v21,off16,r6
	lvx v22,off32,r6
	lvx v23,off48,r6
	lvx v24,off64,r6
	lvx v25,off80,r6
	lvx v26,off96,r6
	lvx v27,off112,r6
	lvx v28,0,r7
	lvx v29,off16,r7

	ld r31,-8(r1)
	ld r30,-16(r1)
	ld r29,-24(r1)
	ld r28,-32(r1)
	ld r27,-40(r1)
	ld r26,-48(r1)
	ld r25,-56(r1)

	blr

.Lfirst_warm_up_done:
	lvx const1,0,r3
	addi r3,r3,16

	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	b .Lsecond_cool_down

.Lshort:
	cmpdi r5,0
	beq .Lzero

	addis r3,r2,.short_constants@toc@ha
	addi r3,r3,.short_constants@toc@l

	/* Calculate where in the constant table we need to start */
	subfic r6,r5,256
	add r3,r3,r6
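	/*
	 * .short_constants appears to hold one 16 byte constant per 16 byte
	 * chunk of a 256 byte buffer; starting r3 at offset 256 - len lines
	 * the last chunk of the input up with the last constant, so each
	 * chunk is multiplied by the power of x matching its distance from
	 * the end of the buffer.
	 */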

	/* How many 16 byte chunks? */
	srdi r7,r5,4
	mtctr r7

	vxor v19,v19,v19
	vxor v20,v20,v20

	lvx v0,0,r4
	lvx v16,0,r3
	VPERM(v0,v0,v16,byteswap)
	vxor v0,v0,v8	/* xor in initial value */
	VPMSUMW(v0,v0,v16)
	bdz .Lv0

	lvx v1,off16,r4
	lvx v17,off16,r3
	VPERM(v1,v1,v17,byteswap)
	VPMSUMW(v1,v1,v17)
	bdz .Lv1

	lvx v2,off32,r4
	lvx v16,off32,r3
	VPERM(v2,v2,v16,byteswap)
	VPMSUMW(v2,v2,v16)
	bdz .Lv2

	lvx v3,off48,r4
	lvx v17,off48,r3
	VPERM(v3,v3,v17,byteswap)
	VPMSUMW(v3,v3,v17)
	bdz .Lv3

	lvx v4,off64,r4
	lvx v16,off64,r3
	VPERM(v4,v4,v16,byteswap)
	VPMSUMW(v4,v4,v16)
	bdz .Lv4

	lvx v5,off80,r4
	lvx v17,off80,r3
	VPERM(v5,v5,v17,byteswap)
	VPMSUMW(v5,v5,v17)
	bdz .Lv5

	lvx v6,off96,r4
	lvx v16,off96,r3
	VPERM(v6,v6,v16,byteswap)
	VPMSUMW(v6,v6,v16)
	bdz .Lv6

	lvx v7,off112,r4
	lvx v17,off112,r3
	VPERM(v7,v7,v17,byteswap)
	VPMSUMW(v7,v7,v17)
	bdz .Lv7

	addi r3,r3,128
	addi r4,r4,128

	lvx v8,0,r4
	lvx v16,0,r3
	VPERM(v8,v8,v16,byteswap)
	VPMSUMW(v8,v8,v16)
	bdz .Lv8

	lvx v9,off16,r4
	lvx v17,off16,r3
	VPERM(v9,v9,v17,byteswap)
	VPMSUMW(v9,v9,v17)
	bdz .Lv9

	lvx v10,off32,r4
	lvx v16,off32,r3
	VPERM(v10,v10,v16,byteswap)
	VPMSUMW(v10,v10,v16)
	bdz .Lv10

	lvx v11,off48,r4
	lvx v17,off48,r3
	VPERM(v11,v11,v17,byteswap)
	VPMSUMW(v11,v11,v17)
	bdz .Lv11

	lvx v12,off64,r4
	lvx v16,off64,r3
	VPERM(v12,v12,v16,byteswap)
	VPMSUMW(v12,v12,v16)
	bdz .Lv12

	lvx v13,off80,r4
	lvx v17,off80,r3
	VPERM(v13,v13,v17,byteswap)
	VPMSUMW(v13,v13,v17)
	bdz .Lv13

	lvx v14,off96,r4
	lvx v16,off96,r3
	VPERM(v14,v14,v16,byteswap)
	VPMSUMW(v14,v14,v16)
	bdz .Lv14

	lvx v15,off112,r4
	lvx v17,off112,r3
	VPERM(v15,v15,v17,byteswap)
	VPMSUMW(v15,v15,v17)

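	/*
	 * The bdz branches above jump into this chain part way down, so only
	 * the chunks that were actually processed get folded in; they are
	 * accumulated alternately into v19 and v20 and combined below.
	 */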
.Lv15:	vxor v19,v19,v15
.Lv14:	vxor v20,v20,v14
.Lv13:	vxor v19,v19,v13
.Lv12:	vxor v20,v20,v12
.Lv11:	vxor v19,v19,v11
.Lv10:	vxor v20,v20,v10
.Lv9:	vxor v19,v19,v9
.Lv8:	vxor v20,v20,v8
.Lv7:	vxor v19,v19,v7
.Lv6:	vxor v20,v20,v6
.Lv5:	vxor v19,v19,v5
.Lv4:	vxor v20,v20,v4
.Lv3:	vxor v19,v19,v3
.Lv2:	vxor v20,v20,v2
.Lv1:	vxor v19,v19,v1
.Lv0:	vxor v20,v20,v0

	vxor v0,v19,v20

	b .Lbarrett_reduction

.Lzero:
	mr r3,r10
	b .Lout

FUNC_END(CRC32_FUNCTION_ASM)