/*
 * Calculate the checksum of data that is 16 byte aligned and a multiple of
 * 16 bytes.
 *
 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
 * chunks in order to mask the latency of the vpmsum instructions. If we
 * have more than 32 kB of data to checksum we repeat this step multiple
 * times, passing in the previous 1024 bits.
 *
 * The next step is to reduce the 1024 bits to 64 bits. This step adds
 * 32 bits of 0s to the end - this matches what a CRC does. We just
 * calculate constants that land the data in this 32 bits.
 *
 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
 * for n = CRC using POWER8 instructions. We use x = 32.
 *
 * http://en.wikipedia.org/wiki/Barrett_reduction
 *
 * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of either:
 *
 *  a) the GNU General Public License as published by the Free Software
 *     Foundation; either version 2 of the License, or (at your option)
 *     any later version, or
 *  b) the Apache License, Version 2.0
 */
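
/*
 * Rough outline of the final Barrett step (see the labelled instructions
 * after .Lbarrett_reduction below): with the 64-bit remainder a in v0,
 * compute q = floor(a * m / 2^64) with vpmsumd, then the CRC is a xor (q * n),
 * since subtraction is xor in GF(2). The constants m and n are taken from
 * the .barrett_constants table provided by the constants header.
 */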

#if defined (__clang__)
#ifndef __ALTIVEC__
#define __ALTIVEC__
#endif
#include "ppc-asm.h"
#else
#include <ppc-asm.h>
#endif
#include "ppc-opcode.h"

#undef toc

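/*
 * r1 is the stack pointer and r2 the TOC pointer; the fallback defines
 * below are presumably for ppc-asm.h variants that do not name them.
 */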
#ifndef r1
#define r1	1
#endif

#ifndef r2
#define r2	2
#endif

	.section .rodata
	.balign 16

.byteswap_constant:
	/* byte reverse permute constant */
	.octa 0x0F0E0D0C0B0A09080706050403020100

#ifdef CRC32_CONSTANTS_HEADER
#include CRC32_CONSTANTS_HEADER
#else
#include "crc32c_ppc_constants.h"
#endif

	.text

#if defined(__BIG_ENDIAN__) && defined(REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#else
#undef BYTESWAP_DATA
#endif

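/* Byte offsets, kept in non-volatile GPRs, used as index registers for lvx/stvx */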
#define off16	r25
#define off32	r26
#define off48	r27
#define off64	r28
#define off80	r29
#define off96	r30
#define off112	r31

#define const1	v24
#define const2	v25

#define byteswap	v26
#define mask_32bit	v27
#define mask_64bit	v28
#define zeroes		v29

#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif

#ifndef CRC32_FUNCTION_ASM
#define CRC32_FUNCTION_ASM __crc32_vpmsum
#endif

/* unsigned int __crc32_vpmsum(unsigned int crc, void *p, unsigned long len) */
FUNC_START(CRC32_FUNCTION_ASM)
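	/* Save the non-volatile GPRs we use (r25-r31) below the stack pointer */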
	std r31,-8(r1)
	std r30,-16(r1)
	std r29,-24(r1)
	std r28,-32(r1)
	std r27,-40(r1)
	std r26,-48(r1)
	std r25,-56(r1)

	li off16,16
	li off32,32
	li off48,48
	li off64,64
	li off80,80
	li off96,96
	li off112,112
	li r0,0
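	/* r0 flags whether v16-v23 already hold data from a previous pass of the outer loop */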

	/* Enough room for saving 10 non volatile VMX registers */
	subi r6,r1,56+10*16
	subi r7,r1,56+2*16

	stvx v20,0,r6
	stvx v21,off16,r6
	stvx v22,off32,r6
	stvx v23,off48,r6
	stvx v24,off64,r6
	stvx v25,off80,r6
	stvx v26,off96,r6
	stvx v27,off112,r6
	stvx v28,0,r7
	stvx v29,off16,r7

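	/* Keep the original crc argument in r10 so the zero length path can return it unchanged */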
	mr r10,r3

	vxor zeroes,zeroes,zeroes
	vspltisw v0,-1

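	/* Build masks that select the bottom 32 and 64 bits of a vector */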
	vsldoi mask_32bit,zeroes,v0,4
	vsldoi mask_64bit,zeroes,v0,8

	/* Get the initial value into v8 */
	vxor v8,v8,v8
	MTVRD(v8, r3)
#ifdef REFLECT
	vsldoi v8,zeroes,v8,8	/* shift into bottom 32 bits */
#else
	vsldoi v8,v8,zeroes,4	/* shift into top 32 bits */
#endif

#ifdef BYTESWAP_DATA
	addis r3,r2,.byteswap_constant@toc@ha
	addi r3,r3,.byteswap_constant@toc@l

	lvx byteswap,0,r3
	addi r3,r3,16
#endif

	cmpdi r5,256
	blt .Lshort

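	/* Round the length down to a multiple of 128 bytes; the remainder is handled after the main loop */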
	rldicr r6,r5,0,56

	/* Checksum in blocks of MAX_SIZE */
1:	lis r7,MAX_SIZE@h
	ori r7,r7,MAX_SIZE@l
	mr r9,r7
	cmpd r6,r7
	bgt 2f
	mr r7,r6
2:	subf r6,r7,r6

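	/* r7 now holds the bytes to checksum this pass (at most MAX_SIZE), r6 the bytes left after it */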
	/* our main loop does 128 bytes at a time */
	srdi r7,r7,7

	/*
	 * Work out the offset into the constants table to start at. Each
	 * constant is 16 bytes, and it is used against 128 bytes of input
	 * data - 128 / 16 = 8
	 */
	sldi r8,r7,4
	srdi r9,r9,3
	subf r8,r8,r9

	/* We reduce our final 128 bytes in a separate step */
	addi r7,r7,-1
	mtctr r7

	addis r3,r2,.constants@toc@ha
	addi r3,r3,.constants@toc@l

	/* Find the start of our constants */
	add r3,r3,r8

	/* zero v0-v7 which will contain our checksums */
	vxor v0,v0,v0
	vxor v1,v1,v1
	vxor v2,v2,v2
	vxor v3,v3,v3
	vxor v4,v4,v4
	vxor v5,v5,v5
	vxor v6,v6,v6
	vxor v7,v7,v7

	lvx const1,0,r3

	/*
	 * If we are looping back to consume more data we use the values
	 * already in v16-v23.
	 */
	cmpdi r0,1
	beq 2f

	/* First warm up pass */
	lvx v16,0,r4
	lvx v17,off16,r4
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)
	lvx v18,off32,r4
	lvx v19,off48,r4
	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)
	lvx v20,off64,r4
	lvx v21,off80,r4
	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)
	lvx v22,off96,r4
	lvx v23,off112,r4
	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)
	addi r4,r4,8*16

	/* xor in initial value */
	vxor v16,v16,v8

2:	bdz .Lfirst_warm_up_done

	addi r3,r3,16
	lvx const2,0,r3

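	/* The ori r2,r2,0 instructions below are no-ops, apparently there to pad dispatch groups */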
	/* Second warm up pass */
	VPMSUMD(v8,v16,const1)
	lvx v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori r2,r2,0

	VPMSUMD(v9,v17,const1)
	lvx v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori r2,r2,0

	VPMSUMD(v10,v18,const1)
	lvx v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori r2,r2,0

	VPMSUMD(v11,v19,const1)
	lvx v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	ori r2,r2,0

	VPMSUMD(v12,v20,const1)
	lvx v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori r2,r2,0

	VPMSUMD(v13,v21,const1)
	lvx v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori r2,r2,0

	VPMSUMD(v14,v22,const1)
	lvx v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori r2,r2,0

	VPMSUMD(v15,v23,const1)
	lvx v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi r4,r4,8*16

	bdz .Lfirst_cool_down

	/*
	 * main loop. We modulo schedule it such that it takes three iterations
	 * to complete - first iteration load, second iteration vpmsum, third
	 * iteration xor.
	 */
	.balign 16
4:	lvx const1,0,r3
	addi r3,r3,16
	ori r2,r2,0

	vxor v0,v0,v8
	VPMSUMD(v8,v16,const2)
	lvx v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori r2,r2,0

	vxor v1,v1,v9
	VPMSUMD(v9,v17,const2)
	lvx v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori r2,r2,0

	vxor v2,v2,v10
	VPMSUMD(v10,v18,const2)
	lvx v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori r2,r2,0

	vxor v3,v3,v11
	VPMSUMD(v11,v19,const2)
	lvx v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	lvx const2,0,r3
	ori r2,r2,0

	vxor v4,v4,v12
	VPMSUMD(v12,v20,const1)
	lvx v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori r2,r2,0

	vxor v5,v5,v13
	VPMSUMD(v13,v21,const1)
	lvx v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori r2,r2,0

	vxor v6,v6,v14
	VPMSUMD(v14,v22,const1)
	lvx v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori r2,r2,0

	vxor v7,v7,v15
	VPMSUMD(v15,v23,const1)
	lvx v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi r4,r4,8*16

	bdnz 4b

.Lfirst_cool_down:
	/* First cool down pass */
	lvx const1,0,r3
	addi r3,r3,16

	vxor v0,v0,v8
	VPMSUMD(v8,v16,const1)
	ori r2,r2,0

	vxor v1,v1,v9
	VPMSUMD(v9,v17,const1)
	ori r2,r2,0

	vxor v2,v2,v10
	VPMSUMD(v10,v18,const1)
	ori r2,r2,0

	vxor v3,v3,v11
	VPMSUMD(v11,v19,const1)
	ori r2,r2,0

	vxor v4,v4,v12
	VPMSUMD(v12,v20,const1)
	ori r2,r2,0

	vxor v5,v5,v13
	VPMSUMD(v13,v21,const1)
	ori r2,r2,0

	vxor v6,v6,v14
	VPMSUMD(v14,v22,const1)
	ori r2,r2,0

	vxor v7,v7,v15
	VPMSUMD(v15,v23,const1)
	ori r2,r2,0

.Lsecond_cool_down:
	/* Second cool down pass */
	vxor v0,v0,v8
	vxor v1,v1,v9
	vxor v2,v2,v10
	vxor v3,v3,v11
	vxor v4,v4,v12
	vxor v5,v5,v13
	vxor v6,v6,v14
	vxor v7,v7,v15

#ifdef REFLECT
	/*
	 * vpmsumd produces a 96 bit result in the least significant bits
	 * of the register. Since we are bit reflected we have to shift it
	 * left 32 bits so it occupies the least significant bits in the
	 * bit reflected domain.
	 */
	vsldoi v0,v0,zeroes,4
	vsldoi v1,v1,zeroes,4
	vsldoi v2,v2,zeroes,4
	vsldoi v3,v3,zeroes,4
	vsldoi v4,v4,zeroes,4
	vsldoi v5,v5,zeroes,4
	vsldoi v6,v6,zeroes,4
	vsldoi v7,v7,zeroes,4
#endif

	/* xor with last 1024 bits */
	lvx v8,0,r4
	lvx v9,off16,r4
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)
	lvx v10,off32,r4
	lvx v11,off48,r4
	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)
	lvx v12,off64,r4
	lvx v13,off80,r4
	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)
	lvx v14,off96,r4
	lvx v15,off112,r4
	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)

	addi r4,r4,8*16

	vxor v16,v0,v8
	vxor v17,v1,v9
	vxor v18,v2,v10
	vxor v19,v3,v11
	vxor v20,v4,v12
	vxor v21,v5,v13
	vxor v22,v6,v14
	vxor v23,v7,v15

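	/* If there is more data, loop back with the partial checksums already in v16-v23 (r0=1 signals this) */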
	li r0,1
	cmpdi r6,0
	addi r6,r6,128
	bne 1b

	/* Work out how many bytes we have left */
	andi. r5,r5,127

	/* Calculate where in the constant table we need to start */
	subfic r6,r5,128
	add r3,r3,r6

	/* How many 16 byte chunks are in the tail */
	srdi r7,r5,4
	mtctr r7

	/*
	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
	 * 32 bits to include the trailing 32 bits of zeros
	 */
	lvx v0,0,r3
	lvx v1,off16,r3
	lvx v2,off32,r3
	lvx v3,off48,r3
	lvx v4,off64,r3
	lvx v5,off80,r3
	lvx v6,off96,r3
	lvx v7,off112,r3
	addi r3,r3,8*16

	VPMSUMW(v0,v16,v0)
	VPMSUMW(v1,v17,v1)
	VPMSUMW(v2,v18,v2)
	VPMSUMW(v3,v19,v3)
	VPMSUMW(v4,v20,v4)
	VPMSUMW(v5,v21,v5)
	VPMSUMW(v6,v22,v6)
	VPMSUMW(v7,v23,v7)

	/* Now reduce the tail (0 - 112 bytes) */
	cmpdi r7,0
	beq 1f

	lvx v16,0,r4
	lvx v17,0,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off16,r4
	lvx v17,off16,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off32,r4
	lvx v17,off32,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off48,r4
	lvx v17,off48,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off64,r4
	lvx v17,off64,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off80,r4
	lvx v17,off80,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16
	bdz 1f

	lvx v16,off96,r4
	lvx v17,off96,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor v0,v0,v16

	/* Now xor all the parallel chunks together */
1:	vxor v0,v0,v1
	vxor v2,v2,v3
	vxor v4,v4,v5
	vxor v6,v6,v7

	vxor v0,v0,v2
	vxor v4,v4,v6

	vxor v0,v0,v4

.Lbarrett_reduction:
	/* Barrett constants */
	addis r3,r2,.barrett_constants@toc@ha
	addi r3,r3,.barrett_constants@toc@l

	lvx const1,0,r3
	lvx const2,off16,r3

	vsldoi v1,v0,v0,8
	vxor v0,v0,v1		/* xor two 64 bit results together */

#ifdef REFLECT
	/* shift left one bit */
	vspltisb v1,1
	vsl v0,v0,v1
#endif

	vand v0,v0,mask_64bit

#ifndef REFLECT
	/*
	 * Now for the Barrett reduction algorithm. The idea is to calculate q,
	 * the multiple of our polynomial that we need to subtract. By
	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
	 * result back down 2x bits, we round down to the nearest multiple.
	 */
	VPMSUMD(v1,v0,const1)	/* ma */
	vsldoi v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Get the result into r3. We need to shift it left 8 bytes:
	 * V0 [ 0 1 2 X ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi v0,v0,zeroes,8	/* shift result into top 64 bits */
#else
	/*
	 * The reflected version of Barrett reduction. Instead of bit
	 * reflecting our data (which is expensive to do), we bit reflect our
	 * constants and our algorithm, which means the intermediate data in
	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
	 * the algorithm because we don't carry in mod 2 arithmetic.
	 */
	vand v1,v0,mask_32bit	/* bottom 32 bits of a */
	VPMSUMD(v1,v1,const1)	/* ma */
	vand v1,v1,mask_32bit	/* bottom 32 bits of ma */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Since we are bit reflected, the result (ie the low 32 bits) is in
	 * the high 32 bits. We just need to shift it left 4 bytes
	 * V0 [ 0 1 X 3 ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi v0,v0,zeroes,4	/* shift result into top 64 bits of v0 */
#endif

	/* Get it into r3 */
	MFVRD(r3, v0)

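/* Restore the non-volatile VMX and GPR registers and return with the CRC in r3 */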
.Lout:
	subi r6,r1,56+10*16
	subi r7,r1,56+2*16

	lvx v20,0,r6
	lvx v21,off16,r6
	lvx v22,off32,r6
	lvx v23,off48,r6
	lvx v24,off64,r6
	lvx v25,off80,r6
	lvx v26,off96,r6
	lvx v27,off112,r6
	lvx v28,0,r7
	lvx v29,off16,r7

	ld r31,-8(r1)
	ld r30,-16(r1)
	ld r29,-24(r1)
	ld r28,-32(r1)
	ld r27,-40(r1)
	ld r26,-48(r1)
	ld r25,-56(r1)

	blr

.Lfirst_warm_up_done:
	lvx const1,0,r3
	addi r3,r3,16

	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	b .Lsecond_cool_down

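/*
 * Short path for lengths below 256 bytes: each 16 byte chunk is multiplied
 * by its own constant from .short_constants and the results are accumulated
 * in v19 and v20.
 */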
.Lshort:
	cmpdi r5,0
	beq .Lzero

	addis r3,r2,.short_constants@toc@ha
	addi r3,r3,.short_constants@toc@l

	/* Calculate where in the constant table we need to start */
	subfic r6,r5,256
	add r3,r3,r6

	/* How many 16 byte chunks? */
	srdi r7,r5,4
	mtctr r7

	vxor v19,v19,v19
	vxor v20,v20,v20

	lvx v0,0,r4
	lvx v16,0,r3
	VPERM(v0,v0,v16,byteswap)
	vxor v0,v0,v8	/* xor in initial value */
	VPMSUMW(v0,v0,v16)
	bdz .Lv0

	lvx v1,off16,r4
	lvx v17,off16,r3
	VPERM(v1,v1,v17,byteswap)
	VPMSUMW(v1,v1,v17)
	bdz .Lv1

	lvx v2,off32,r4
	lvx v16,off32,r3
	VPERM(v2,v2,v16,byteswap)
	VPMSUMW(v2,v2,v16)
	bdz .Lv2

	lvx v3,off48,r4
	lvx v17,off48,r3
	VPERM(v3,v3,v17,byteswap)
	VPMSUMW(v3,v3,v17)
	bdz .Lv3

	lvx v4,off64,r4
	lvx v16,off64,r3
	VPERM(v4,v4,v16,byteswap)
	VPMSUMW(v4,v4,v16)
	bdz .Lv4

	lvx v5,off80,r4
	lvx v17,off80,r3
	VPERM(v5,v5,v17,byteswap)
	VPMSUMW(v5,v5,v17)
	bdz .Lv5

	lvx v6,off96,r4
	lvx v16,off96,r3
	VPERM(v6,v6,v16,byteswap)
	VPMSUMW(v6,v6,v16)
	bdz .Lv6

	lvx v7,off112,r4
	lvx v17,off112,r3
	VPERM(v7,v7,v17,byteswap)
	VPMSUMW(v7,v7,v17)
	bdz .Lv7

	addi r3,r3,128
	addi r4,r4,128

	lvx v8,0,r4
	lvx v16,0,r3
	VPERM(v8,v8,v16,byteswap)
	VPMSUMW(v8,v8,v16)
	bdz .Lv8

	lvx v9,off16,r4
	lvx v17,off16,r3
	VPERM(v9,v9,v17,byteswap)
	VPMSUMW(v9,v9,v17)
	bdz .Lv9

	lvx v10,off32,r4
	lvx v16,off32,r3
	VPERM(v10,v10,v16,byteswap)
	VPMSUMW(v10,v10,v16)
	bdz .Lv10

	lvx v11,off48,r4
	lvx v17,off48,r3
	VPERM(v11,v11,v17,byteswap)
	VPMSUMW(v11,v11,v17)
	bdz .Lv11

	lvx v12,off64,r4
	lvx v16,off64,r3
	VPERM(v12,v12,v16,byteswap)
	VPMSUMW(v12,v12,v16)
	bdz .Lv12

	lvx v13,off80,r4
	lvx v17,off80,r3
	VPERM(v13,v13,v17,byteswap)
	VPMSUMW(v13,v13,v17)
	bdz .Lv13

	lvx v14,off96,r4
	lvx v16,off96,r3
	VPERM(v14,v14,v16,byteswap)
	VPMSUMW(v14,v14,v16)
	bdz .Lv14

	lvx v15,off112,r4
	lvx v17,off112,r3
	VPERM(v15,v15,v17,byteswap)
	VPMSUMW(v15,v15,v17)

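/* Entering the ladder at .LvN via the bdz branches above accumulates only the chunks that were processed */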
.Lv15:	vxor v19,v19,v15
.Lv14:	vxor v20,v20,v14
.Lv13:	vxor v19,v19,v13
.Lv12:	vxor v20,v20,v12
.Lv11:	vxor v19,v19,v11
.Lv10:	vxor v20,v20,v10
.Lv9:	vxor v19,v19,v9
.Lv8:	vxor v20,v20,v8
.Lv7:	vxor v19,v19,v7
.Lv6:	vxor v20,v20,v6
.Lv5:	vxor v19,v19,v5
.Lv4:	vxor v20,v20,v4
.Lv3:	vxor v19,v19,v3
.Lv2:	vxor v20,v20,v2
.Lv1:	vxor v19,v19,v1
.Lv0:	vxor v20,v20,v0

	vxor v0,v19,v20

	b .Lbarrett_reduction

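/* Zero length input: return the original crc unchanged */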
.Lzero:
	mr r3,r10
	b .Lout

FUNC_END(CRC32_FUNCTION_ASM)