/*
 * Core of the accelerated CRC algorithm.
 * Define the constants and CRC_FUNCTION_NAME in the including file,
 * then include this file.
 *
 * Calculate the checksum of data that is 16 byte aligned and a multiple of
 * 16 bytes long.
 *
 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
 * chunks in order to mask the latency of the vpmsum instructions. If we
 * have more than 32 kB of data to checksum we repeat this step multiple
 * times, passing in the previous 1024 bits.
 *
 * The next step is to reduce the 1024 bits to 64 bits. This step adds
 * 32 bits of 0s to the end - this matches what a CRC does. We just
 * calculate constants that land the data in these 32 bits.
 *
 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
 * for n = CRC using POWER8 instructions. We use x = 32.
 *
 * http://en.wikipedia.org/wiki/Barrett_reduction
 *
 * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
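
/*
 * For orientation, a rough C model of what a single vpmsumd
 * contributes (a sketch only, not part of the build: clmul64() is a
 * hypothetical 64x64->128 bit carry-less multiply, and k_hi/k_lo
 * stand for the two doublewords of a constants-table entry):
 *
 *	typedef unsigned long long u64;
 *	typedef unsigned __int128 u128;
 *
 *	u128 clmul64(u64 a, u64 b);	// carry-less (XOR) multiply
 *
 *	// vpmsumd vD,vA,vB: multiply each 64 bit half of vA by the
 *	// matching half of vB and xor the two 128 bit products.
 *	u128 vpmsumd(u128 a, u64 k_hi, u64 k_lo)
 *	{
 *		return clmul64((u64)(a >> 64), k_hi) ^
 *		       clmul64((u64)a, k_lo);
 *	}
 *
 * In effect each 16 byte chunk of input is fed through this with a
 * constant of the form x^(bit distance to the end) mod n, and all
 * the partial products are xored together before the final Barrett
 * step.
 */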

#include <asm/ppc_asm.h>
#include <asm/ppc-opcode.h>

#define MAX_SIZE	32768

	.text

#if defined(__BIG_ENDIAN__) && defined(REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#else
#undef BYTESWAP_DATA
#endif
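
/*
 * vpmsum operates on the data exactly as it sits in the register, so
 * the bytes must be swapped whenever the CPU byte order disagrees
 * with the CRC's bit ordering: big endian with a reflected CRC, or
 * little endian with a non-reflected one, as the #if above encodes.
 */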

#define off16		r25
#define off32		r26
#define off48		r27
#define off64		r28
#define off80		r29
#define off96		r30
#define off112		r31

#define const1		v24
#define const2		v25

#define byteswap	v26
#define mask_32bit	v27
#define mask_64bit	v28
#define zeroes		v29

#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif

/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
FUNC_START(CRC_FUNCTION_NAME)
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)
	std	r26,-48(r1)
	std	r25,-56(r1)

	li	off16,16
	li	off32,32
	li	off48,48
	li	off64,64
	li	off80,80
	li	off96,96
	li	off112,112
	li	r0,0
	/* Enough room for saving 10 non-volatile VMX registers */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16
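
	/*
	 * The GPR saves above and the VMX saves below all sit below
	 * the stack pointer; this presumably relies on the PPC64 ABI
	 * red zone (288 bytes below r1), which the 56+10*16 = 216
	 * bytes used here fit within, so no stack frame is allocated.
	 */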

	stvx	v20,0,r6
	stvx	v21,off16,r6
	stvx	v22,off32,r6
	stvx	v23,off48,r6
	stvx	v24,off64,r6
	stvx	v25,off80,r6
	stvx	v26,off96,r6
	stvx	v27,off112,r6
	stvx	v28,0,r7
	stvx	v29,off16,r7

	mr	r10,r3

	vxor	zeroes,zeroes,zeroes
	vspltisw v0,-1

	vsldoi	mask_32bit,zeroes,v0,4
	vsldoi	mask_64bit,zeroes,v0,8

	/* Get the initial value into v8 */
	vxor	v8,v8,v8
	MTVRD(v8, R3)
#ifdef REFLECT
	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
#else
	vsldoi	v8,v8,zeroes,4	/* shift into top 32 bits */
#endif

#ifdef BYTESWAP_DATA
	addis	r3,r2,.byteswap_constant@toc@ha
	addi	r3,r3,.byteswap_constant@toc@l

	lvx	byteswap,0,r3
	addi	r3,r3,16
#endif

	cmpdi	r5,256
	blt	.Lshort

	rldicr	r6,r5,0,56

	/* Checksum in blocks of MAX_SIZE */
1:	lis	r7,MAX_SIZE@h
	ori	r7,r7,MAX_SIZE@l
	mr	r9,r7
	cmpd	r6,r7
	bgt	2f
	mr	r7,r6
2:	subf	r6,r7,r6

	/* our main loop does 128 bytes at a time */
	srdi	r7,r7,7

	/*
	 * Work out the offset into the constants table to start at. Each
	 * constant is 16 bytes, and it is used against 128 bytes of input
	 * data - 128 / 16 = 8
	 */
	sldi	r8,r7,4
	srdi	r9,r9,3
	subf	r8,r8,r9
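
	/*
	 * e.g. a 16 kB pass has 16384/128 = 128 blocks of 128 bytes,
	 * so r8 = 32768/8 - 128*16 = 2048 and we start half way into
	 * the 4 kB table, at the entry sized for 16 kB of input.
	 */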

	/* We reduce our final 128 bytes in a separate step */
	addi	r7,r7,-1
	mtctr	r7

	addis	r3,r2,.constants@toc@ha
	addi	r3,r3,.constants@toc@l

	/* Find the start of our constants */
	add	r3,r3,r8

	/* zero v0-v7 which will contain our checksums */
	vxor	v0,v0,v0
	vxor	v1,v1,v1
	vxor	v2,v2,v2
	vxor	v3,v3,v3
	vxor	v4,v4,v4
	vxor	v5,v5,v5
	vxor	v6,v6,v6
	vxor	v7,v7,v7

	lvx	const1,0,r3

	/*
	 * If we are looping back to consume more data we use the values
	 * already in v16-v23.
	 */
	cmpdi	r0,1
	beq	2f

	/* First warm up pass */
	lvx	v16,0,r4
	lvx	v17,off16,r4
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)
	lvx	v18,off32,r4
	lvx	v19,off48,r4
	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)
	lvx	v20,off64,r4
	lvx	v21,off80,r4
	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)
	lvx	v22,off96,r4
	lvx	v23,off112,r4
	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)
	addi	r4,r4,8*16

	/* xor in initial value */
	vxor	v16,v16,v8

2:	bdz	.Lfirst_warm_up_done

	addi	r3,r3,16
	lvx	const2,0,r3

	/* Second warm up pass */
	VPMSUMD(v8,v16,const1)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0
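
	/*
	 * The ori r2,r2,0 instructions sprinkled through these loops
	 * are no-ops; they appear to be there purely as scheduling
	 * hints, padding the instruction stream so the loads, vpmsums
	 * and xors fall into separate dispatch groups on POWER8.
	 */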

	VPMSUMD(v9,v17,const1)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	VPMSUMD(v10,v18,const1)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	VPMSUMD(v11,v19,const1)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	ori	r2,r2,0

	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdz	.Lfirst_cool_down

	/*
	 * main loop. We modulo schedule it such that it takes three iterations
	 * to complete - first iteration load, second iteration vpmsum, third
	 * iteration xor.
	 */
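	/*
	 * Per stream i (of 8), each trip through the loop below thus
	 * overlaps three generations of data:
	 *	v(i)	xor in the product computed last iteration
	 *	v(8+i)	vpmsum the data loaded last iteration
	 *	v(16+i)	load fresh data for the next iteration
	 */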
	.balign	16
4:	lvx	const1,0,r3
	addi	r3,r3,16
	ori	r2,r2,0

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const2)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const2)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const2)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const2)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	lvx	const2,0,r3
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdnz	4b

.Lfirst_cool_down:
	/* First cool down pass */
	lvx	const1,0,r3
	addi	r3,r3,16

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const1)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const1)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const1)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const1)
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	ori	r2,r2,0

.Lsecond_cool_down:
	/* Second cool down pass */
	vxor	v0,v0,v8
	vxor	v1,v1,v9
	vxor	v2,v2,v10
	vxor	v3,v3,v11
	vxor	v4,v4,v12
	vxor	v5,v5,v13
	vxor	v6,v6,v14
	vxor	v7,v7,v15

#ifdef REFLECT
	/*
	 * vpmsumd produces a 96 bit result in the least significant bits
	 * of the register. Since we are bit reflected we have to shift it
	 * left 32 bits so it occupies the least significant bits in the
	 * bit reflected domain.
	 */
	vsldoi	v0,v0,zeroes,4
	vsldoi	v1,v1,zeroes,4
	vsldoi	v2,v2,zeroes,4
	vsldoi	v3,v3,zeroes,4
	vsldoi	v4,v4,zeroes,4
	vsldoi	v5,v5,zeroes,4
	vsldoi	v6,v6,zeroes,4
	vsldoi	v7,v7,zeroes,4
#endif

	/* xor with last 1024 bits */
	lvx	v8,0,r4
	lvx	v9,off16,r4
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)
	lvx	v10,off32,r4
	lvx	v11,off48,r4
	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)
	lvx	v12,off64,r4
	lvx	v13,off80,r4
	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)
	lvx	v14,off96,r4
	lvx	v15,off112,r4
	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)

	addi	r4,r4,8*16

	vxor	v16,v0,v8
	vxor	v17,v1,v9
	vxor	v18,v2,v10
	vxor	v19,v3,v11
	vxor	v20,v4,v12
	vxor	v21,v5,v13
	vxor	v22,v6,v14
	vxor	v23,v7,v15

	li	r0,1
	cmpdi	r6,0
	addi	r6,r6,128
	bne	1b

	/* Work out how many bytes we have left */
	andi.	r5,r5,127

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,128
	add	r3,r3,r6
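
	/*
	 * The reduction constants depend on how many tail bytes follow
	 * the 1024 bits: the table holds one 16 byte entry per 16 byte
	 * chunk, so skipping (128 - tail) bytes lands us on the set
	 * computed for this exact remaining length.
	 */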

	/* How many 16 byte chunks are in the tail */
	srdi	r7,r5,4
	mtctr	r7

	/*
	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
	 * 32 bits to include the trailing 32 bits of zeros
	 */
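	/*
	 * Each of the eight streams gets its own constant, since each
	 * sits a different distance (a multiple of 128 bits) from the
	 * end of the message.
	 */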
	lvx	v0,0,r3
	lvx	v1,off16,r3
	lvx	v2,off32,r3
	lvx	v3,off48,r3
	lvx	v4,off64,r3
	lvx	v5,off80,r3
	lvx	v6,off96,r3
	lvx	v7,off112,r3
	addi	r3,r3,8*16

	VPMSUMW(v0,v16,v0)
	VPMSUMW(v1,v17,v1)
	VPMSUMW(v2,v18,v2)
	VPMSUMW(v3,v19,v3)
	VPMSUMW(v4,v20,v4)
	VPMSUMW(v5,v21,v5)
	VPMSUMW(v6,v22,v6)
	VPMSUMW(v7,v23,v7)

	/* Now reduce the tail (0 - 112 bytes) */
	cmpdi	r7,0
	beq	1f

	lvx	v16,0,r4
	lvx	v17,0,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off16,r4
	lvx	v17,off16,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off32,r4
	lvx	v17,off32,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off48,r4
	lvx	v17,off48,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off64,r4
	lvx	v17,off64,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off80,r4
	lvx	v17,off80,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off96,r4
	lvx	v17,off96,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16

	/* Now xor all the parallel chunks together */
1:	vxor	v0,v0,v1
	vxor	v2,v2,v3
	vxor	v4,v4,v5
	vxor	v6,v6,v7

	vxor	v0,v0,v2
	vxor	v4,v4,v6

	vxor	v0,v0,v4

.Lbarrett_reduction:
	/* Barrett constants */
	addis	r3,r2,.barrett_constants@toc@ha
	addi	r3,r3,.barrett_constants@toc@l

	lvx	const1,0,r3
	lvx	const2,off16,r3

	vsldoi	v1,v0,v0,8
	vxor	v0,v0,v1	/* xor two 64 bit results together */

#ifdef REFLECT
	/* shift left one bit */
	vspltisb v1,1
	vsl	v0,v0,v1
#endif

	vand	v0,v0,mask_64bit
#ifndef REFLECT
	/*
	 * Now for the Barrett reduction algorithm. The idea is to calculate
	 * q, the multiple of our polynomial that we need to subtract. By
	 * doing the computation 2x bits higher (i.e. 64 bits) and shifting
	 * the result back down 2x bits, we round down to the nearest
	 * multiple.
	 */
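	/*
	 * Concretely, with const1 holding m = floor(x^64 / n):
	 *	q = floor(a*m / x^64)	(the "ma" then "q" steps below)
	 *	r = a - q*n = a xor (q*n)	(GF(2) subtraction is xor)
	 */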
	VPMSUMD(v1,v0,const1)	/* ma */
	vsldoi	v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Get the result into r3. We need to shift it left 8 bytes:
	 * V0 [ 0 1 2 X ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,8	/* shift result into top 64 bits */
#else
	/*
	 * The reflected version of Barrett reduction. Instead of bit
	 * reflecting our data (which is expensive to do), we bit reflect our
	 * constants and our algorithm, which means the intermediate data in
	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
	 * the algorithm because we don't carry in mod 2 arithmetic.
	 */
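	/*
	 * Same shape as above, but everything is done in the bottom 32
	 * bits: the final xor cancels the bottom half of a, leaving the
	 * 32 bit remainder just above it.
	 */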
	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
	VPMSUMD(v1,v1,const1)		/* ma */
	vand	v1,v1,mask_32bit	/* bottom 32 bits of ma */
	VPMSUMD(v1,v1,const2)		/* qn */
	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Since we are bit reflected, the result (i.e. the low 32 bits) is
	 * in the high 32 bits. We just need to shift it left 4 bytes
	 * V0 [ 0 1 X 3 ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,4	/* shift result into top 64 bits */
#endif

	/* Get it into r3 */
	MFVRD(R3, v0)

.Lout:
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	lvx	v20,0,r6
	lvx	v21,off16,r6
	lvx	v22,off32,r6
	lvx	v23,off48,r6
	lvx	v24,off64,r6
	lvx	v25,off80,r6
	lvx	v26,off96,r6
	lvx	v27,off112,r6
	lvx	v28,0,r7
	lvx	v29,off16,r7

	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	ld	r26,-48(r1)
	ld	r25,-56(r1)

	blr

.Lfirst_warm_up_done:
	lvx	const1,0,r3
	addi	r3,r3,16

	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	b	.Lsecond_cool_down

.Lshort:
	cmpdi	r5,0
	beq	.Lzero

	addis	r3,r2,.short_constants@toc@ha
	addi	r3,r3,.short_constants@toc@l

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,256
	add	r3,r3,r6

	/* How many 16 byte chunks? */
	srdi	r7,r5,4
	mtctr	r7

	vxor	v19,v19,v19
	vxor	v20,v20,v20

	lvx	v0,0,r4
	lvx	v16,0,r3
	VPERM(v0,v0,v16,byteswap)
	vxor	v0,v0,v8	/* xor in initial value */
	VPMSUMW(v0,v0,v16)
	bdz	.Lv0

	lvx	v1,off16,r4
	lvx	v17,off16,r3
	VPERM(v1,v1,v17,byteswap)
	VPMSUMW(v1,v1,v17)
	bdz	.Lv1

	lvx	v2,off32,r4
	lvx	v16,off32,r3
	VPERM(v2,v2,v16,byteswap)
	VPMSUMW(v2,v2,v16)
	bdz	.Lv2

	lvx	v3,off48,r4
	lvx	v17,off48,r3
	VPERM(v3,v3,v17,byteswap)
	VPMSUMW(v3,v3,v17)
	bdz	.Lv3

	lvx	v4,off64,r4
	lvx	v16,off64,r3
	VPERM(v4,v4,v16,byteswap)
	VPMSUMW(v4,v4,v16)
	bdz	.Lv4

	lvx	v5,off80,r4
	lvx	v17,off80,r3
	VPERM(v5,v5,v17,byteswap)
	VPMSUMW(v5,v5,v17)
	bdz	.Lv5

	lvx	v6,off96,r4
	lvx	v16,off96,r3
	VPERM(v6,v6,v16,byteswap)
	VPMSUMW(v6,v6,v16)
	bdz	.Lv6

	lvx	v7,off112,r4
	lvx	v17,off112,r3
	VPERM(v7,v7,v17,byteswap)
	VPMSUMW(v7,v7,v17)
	bdz	.Lv7

	addi	r3,r3,128
	addi	r4,r4,128

	lvx	v8,0,r4
	lvx	v16,0,r3
	VPERM(v8,v8,v16,byteswap)
	VPMSUMW(v8,v8,v16)
	bdz	.Lv8

	lvx	v9,off16,r4
	lvx	v17,off16,r3
	VPERM(v9,v9,v17,byteswap)
	VPMSUMW(v9,v9,v17)
	bdz	.Lv9

	lvx	v10,off32,r4
	lvx	v16,off32,r3
	VPERM(v10,v10,v16,byteswap)
	VPMSUMW(v10,v10,v16)
	bdz	.Lv10

	lvx	v11,off48,r4
	lvx	v17,off48,r3
	VPERM(v11,v11,v17,byteswap)
	VPMSUMW(v11,v11,v17)
	bdz	.Lv11

	lvx	v12,off64,r4
	lvx	v16,off64,r3
	VPERM(v12,v12,v16,byteswap)
	VPMSUMW(v12,v12,v16)
	bdz	.Lv12

	lvx	v13,off80,r4
	lvx	v17,off80,r3
	VPERM(v13,v13,v17,byteswap)
	VPMSUMW(v13,v13,v17)
	bdz	.Lv13

	lvx	v14,off96,r4
	lvx	v16,off96,r3
	VPERM(v14,v14,v16,byteswap)
	VPMSUMW(v14,v14,v16)
	bdz	.Lv14

	lvx	v15,off112,r4
	lvx	v17,off112,r3
	VPERM(v15,v15,v17,byteswap)
	VPMSUMW(v15,v15,v17)

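	/*
	 * The bdz branches above enter this cascade part way through;
	 * falling through from .LvN xors every chunk processed so far
	 * into one of the two accumulators (v19, v20).
	 */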
.Lv15:	vxor	v19,v19,v15
.Lv14:	vxor	v20,v20,v14
.Lv13:	vxor	v19,v19,v13
.Lv12:	vxor	v20,v20,v12
.Lv11:	vxor	v19,v19,v11
.Lv10:	vxor	v20,v20,v10
.Lv9:	vxor	v19,v19,v9
.Lv8:	vxor	v20,v20,v8
.Lv7:	vxor	v19,v19,v7
.Lv6:	vxor	v20,v20,v6
.Lv5:	vxor	v19,v19,v5
.Lv4:	vxor	v20,v20,v4
.Lv3:	vxor	v19,v19,v3
.Lv2:	vxor	v20,v20,v2
.Lv1:	vxor	v19,v19,v1
.Lv0:	vxor	v20,v20,v0

	vxor	v0,v19,v20

	b	.Lbarrett_reduction

.Lzero:
	mr	r3,r10
	b	.Lout

FUNC_END(CRC_FUNCTION_NAME)