;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2020 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Function API:
; UINT32 crc32_gzip_refl_by8_02(
;         UINT32 init_crc,          //initial CRC value, 32 bits
;         const unsigned char *buf, //buffer pointer to calculate CRC on
;         UINT64 len                //buffer length in bytes (64-bit data)
; );
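;
; Example (illustrative sketch, not part of this file): calling the routine
; from C, with stdint types standing in for UINT32/UINT64 and "data"/"data_len"
; as placeholder names; pass 0 as init_crc for a fresh stream and feed the
; previous result back in to continue one:
;
;     extern uint32_t crc32_gzip_refl_by8_02(uint32_t init_crc,
;                                            const unsigned char *buf,
;                                            uint64_t len);
;
;     uint32_t crc = 0;                               // new stream
;     crc = crc32_gzip_refl_by8_02(crc, data, data_len);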
;
; Authors:
;     Erdinc Ozturk
;     Vinodh Gopal
;     James Guilford
;
; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
; URL: http://download.intel.com/design/intarch/papers/323102.pdf
;
;
; sample yasm command line:
; yasm -f elf64 -X gnu -g dwarf2 crc32_gzip_refl_by8_02.asm
;
; As explained here:
; http://docs.oracle.com/javase/7/docs/api/java/util/zip/package-summary.html
; CRC-32 checksum is described in RFC 1952
; Implementing RFC 1952 CRC:
; http://www.ietf.org/rfc/rfc1952.txt
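;
; For reference, a minimal bit-serial C sketch of the same RFC 1952 CRC
; (reflected polynomial 0xEDB88320); the function name is hypothetical and
; the code below is illustrative only, not part of this file:
;
;     uint32_t crc32_gzip_bitwise(uint32_t init_crc,
;                                 const unsigned char *buf, uint64_t len)
;     {
;             uint32_t crc = ~init_crc;
;             for (uint64_t i = 0; i < len; i++) {
;                     crc ^= buf[i];
;                     for (int b = 0; b < 8; b++)
;                             crc = (crc >> 1) ^ (0xEDB88320 & (0 - (crc & 1)));
;             }
;             return ~crc;
;     }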

%include "reg_sizes.asm"

%define fetch_dist 1024

[bits 64]
default rel

section .text


%ifidn __OUTPUT_FORMAT__, win64
        %xdefine arg1 rcx
        %xdefine arg2 rdx
        %xdefine arg3 r8

        %xdefine arg1_low32 ecx
%else
        %xdefine arg1 rdi
        %xdefine arg2 rsi
        %xdefine arg3 rdx

        %xdefine arg1_low32 edi
%endif

%define TMP 16*0
%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_SAVE 16*2
        %define VARIABLE_OFFSET 16*10+8
%else
        %define VARIABLE_OFFSET 16*2+8
%endif

align 16
mk_global crc32_gzip_refl_by8_02, function
crc32_gzip_refl_by8_02:
        endbranch
        not arg1_low32
        sub rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
        ; push the xmm registers into the stack to maintain
        vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
        vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
        vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
        vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
        vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
        vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
        vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
        vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
%endif

        ; check if smaller than 256B
        cmp arg3, 256
        jl .less_than_256

        ; load the initial crc value
        vmovd xmm10, arg1_low32 ; initial crc

        ; receive the initial 128B data, xor the initial crc value
        vmovdqu xmm0, [arg2+16*0]
        vmovdqu xmm1, [arg2+16*1]
        vmovdqu xmm2, [arg2+16*2]
        vmovdqu xmm3, [arg2+16*3]
        vmovdqu xmm4, [arg2+16*4]
        vmovdqu xmm5, [arg2+16*5]
        vmovdqu xmm6, [arg2+16*6]
        vmovdqu xmm7, [arg2+16*7]

        ; XOR the initial_crc value
        vpxor xmm0, xmm10
        vmovdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
        ;imm value of pclmulqdq instruction will determine which constant to use
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; we subtract 256 instead of 128 to save one instruction from the loop
        sub arg3, 256

        ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The fold_128_B_loop
        ; loop will fold 128B at a time until we have 128+y Bytes of buffer

        ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
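        ; (per 16-byte lane i, each iteration below computes
        ;      D[i] = CLMUL(D[i].low64, rk4) XOR CLMUL(D[i].high64, rk3) XOR new_data[i]
        ;  where rk3/rk4 are the precomputed folding constants derived from powers
        ;  of x modulo the CRC polynomial, as in the paper cited in the header)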
.fold_128_B_loop:
        add arg2, 128
        prefetchnta [arg2+fetch_dist+0]
        vmovdqu xmm9, [arg2+16*0]
        vmovdqu xmm12, [arg2+16*1]
        vpclmulqdq xmm8, xmm0, xmm10, 0x10
        vpclmulqdq xmm0, xmm0, xmm10, 0x1
        vpclmulqdq xmm13, xmm1, xmm10, 0x10
        vpclmulqdq xmm1, xmm1, xmm10, 0x1
        vpxor xmm0, xmm9
        vxorps xmm0, xmm8
        vpxor xmm1, xmm12
        vxorps xmm1, xmm13

        prefetchnta [arg2+fetch_dist+32]
        vmovdqu xmm9, [arg2+16*2]
        vmovdqu xmm12, [arg2+16*3]
        vpclmulqdq xmm8, xmm2, xmm10, 0x10
        vpclmulqdq xmm2, xmm2, xmm10, 0x1
        vpclmulqdq xmm13, xmm3, xmm10, 0x10
        vpclmulqdq xmm3, xmm3, xmm10, 0x1
        vpxor xmm2, xmm9
        vxorps xmm2, xmm8
        vpxor xmm3, xmm12
        vxorps xmm3, xmm13

        prefetchnta [arg2+fetch_dist+64]
        vmovdqu xmm9, [arg2+16*4]
        vmovdqu xmm12, [arg2+16*5]
        vpclmulqdq xmm8, xmm4, xmm10, 0x10
        vpclmulqdq xmm4, xmm4, xmm10, 0x1
        vpclmulqdq xmm13, xmm5, xmm10, 0x10
        vpclmulqdq xmm5, xmm5, xmm10, 0x1
        vpxor xmm4, xmm9
        vxorps xmm4, xmm8
        vpxor xmm5, xmm12
        vxorps xmm5, xmm13

        prefetchnta [arg2+fetch_dist+96]
        vmovdqu xmm9, [arg2+16*6]
        vmovdqu xmm12, [arg2+16*7]
        vpclmulqdq xmm8, xmm6, xmm10, 0x10
        vpclmulqdq xmm6, xmm6, xmm10, 0x1
        vpclmulqdq xmm13, xmm7, xmm10, 0x10
        vpclmulqdq xmm7, xmm7, xmm10, 0x1
        vpxor xmm6, xmm9
        vxorps xmm6, xmm8
        vpxor xmm7, xmm12
        vxorps xmm7, xmm13

        sub arg3, 128
        jge .fold_128_B_loop
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        add arg2, 128
        ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
        ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7

        ; fold the 8 xmm registers to 1 xmm register with different constants
        vmovdqa xmm10, [rk9]
        vpclmulqdq xmm8, xmm0, xmm10, 0x1
        vpclmulqdq xmm0, xmm0, xmm10, 0x10
        vpxor xmm7, xmm8
        vxorps xmm7, xmm0

        vmovdqa xmm10, [rk11]
        vpclmulqdq xmm8, xmm1, xmm10, 0x1
        vpclmulqdq xmm1, xmm1, xmm10, 0x10
        vpxor xmm7, xmm8
        vxorps xmm7, xmm1

        vmovdqa xmm10, [rk13]
        vpclmulqdq xmm8, xmm2, xmm10, 0x1
        vpclmulqdq xmm2, xmm2, xmm10, 0x10
        vpxor xmm7, xmm8
        vpxor xmm7, xmm2

        vmovdqa xmm10, [rk15]
        vpclmulqdq xmm8, xmm3, xmm10, 0x1
        vpclmulqdq xmm3, xmm3, xmm10, 0x10
        vpxor xmm7, xmm8
        vxorps xmm7, xmm3

        vmovdqa xmm10, [rk17]
        vpclmulqdq xmm8, xmm4, xmm10, 0x1
        vpclmulqdq xmm4, xmm4, xmm10, 0x10
        vpxor xmm7, xmm8
        vpxor xmm7, xmm4

        vmovdqa xmm10, [rk19]
        vpclmulqdq xmm8, xmm5, xmm10, 0x1
        vpclmulqdq xmm5, xmm5, xmm10, 0x10
        vpxor xmm7, xmm8
        vxorps xmm7, xmm5

        vmovdqa xmm10, [rk1]
        vpclmulqdq xmm8, xmm6, xmm10, 0x1
        vpclmulqdq xmm6, xmm6, xmm10, 0x10
        vpxor xmm7, xmm8
        vpxor xmm7, xmm6


        ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
        ; instead of a cmp instruction, we use the negative flag with the jl instruction
        add arg3, 128-16
        jl .final_reduction_for_128

        ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
        ; we can fold 16 bytes at a time if y>=16
        ; continue folding 16B at a time

.16B_reduction_loop:
        vpclmulqdq xmm8, xmm7, xmm10, 0x1
        vpclmulqdq xmm7, xmm7, xmm10, 0x10
        vpxor xmm7, xmm8
        vmovdqu xmm0, [arg2]
        vpxor xmm7, xmm0
        add arg2, 16
        sub arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the jge instruction
        ; equivalent of: cmp arg3, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge .16B_reduction_loop

        ;now we have 16+z bytes left to reduce, where 0<= z < 16.
        ;first, we reduce the data in the xmm7 register


.final_reduction_for_128:
        add arg3, 16
        je .128_done

        ; here we are getting data that is less than 16 bytes.
        ; since we know that there was data before the pointer, we can offset
        ; the input pointer before the actual point, to receive exactly 16 bytes.
        ; after that the registers need to be adjusted.
.get_last_two_xmms:

        vmovdqa xmm2, xmm7
        vmovdqu xmm1, [arg2 - 16 + arg3]

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea rax, [pshufb_shf_table]
        add rax, arg3
        vmovdqu xmm0, [rax]

        vpshufb xmm7, xmm0
        vpxor xmm0, [mask3]
        vpshufb xmm2, xmm0

        vpblendvb xmm2, xmm2, xmm1, xmm0
        ;;;;;;;;;;
        vpclmulqdq xmm8, xmm7, xmm10, 0x1
        vpclmulqdq xmm7, xmm7, xmm10, 0x10
        vpxor xmm7, xmm8
        vpxor xmm7, xmm2

.128_done:
        ; compute crc of a 128-bit value
        vmovdqa xmm10, [rk5]
        vmovdqa xmm0, xmm7

        ;64b fold
        vpclmulqdq xmm7, xmm10, 0
        vpsrldq xmm0, 8
        vpxor xmm7, xmm0

        ;32b fold
        vmovdqa xmm0, xmm7
        vpslldq xmm7, 4
        vpclmulqdq xmm7, xmm10, 0x10
        vpxor xmm7, xmm0


        ;barrett reduction
.barrett:
        vpand xmm7, [mask2]
        vmovdqa xmm1, xmm7
        vmovdqa xmm2, xmm7
        vmovdqa xmm10, [rk7]

        vpclmulqdq xmm7, xmm10, 0
        vpxor xmm7, xmm2
        vpand xmm7, [mask]
        vmovdqa xmm2, xmm7
        vpclmulqdq xmm7, xmm10, 0x10
        vpxor xmm7, xmm2
        vpxor xmm7, xmm1
        vpextrd eax, xmm7, 2

.cleanup:
        not eax


%ifidn __OUTPUT_FORMAT__, win64
        vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
        vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
        vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
        vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
        vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
        vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
        vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
        vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
%endif
        add rsp, VARIABLE_OFFSET
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
.less_than_256:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp arg3, 32
        jl .less_than_32

        ; if there is, load the constants
        vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10

        vmovd xmm0, arg1_low32 ; get the initial crc value
        vmovdqu xmm7, [arg2] ; load the plaintext
        vpxor xmm7, xmm0

        ; update the buffer pointer
        add arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        sub arg3, 32

        jmp .16B_reduction_loop


align 16
.less_than_32:
        ; mov initial crc to the return value. this is necessary for zero-length buffers.
        mov eax, arg1_low32
        test arg3, arg3
        je .cleanup

        vmovd xmm0, arg1_low32 ; get the initial crc value

        cmp arg3, 16
        je .exact_16_left
        jl .less_than_16_left

        vmovdqu xmm7, [arg2] ; load the plaintext
        vpxor xmm7, xmm0 ; xor the initial crc value
        add arg2, 16
        sub arg3, 16
        vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
        jmp .get_last_two_xmms

align 16
.less_than_16_left:
        ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.

        vpxor xmm1, xmm1
        mov r11, rsp
        vmovdqa [r11], xmm1

        cmp arg3, 4
        jl .only_less_than_4

        ; backup the counter value
        mov r9, arg3
        cmp arg3, 8
        jl .less_than_8_left

        ; load 8 Bytes
        mov rax, [arg2]
        mov [r11], rax
        add r11, 8
        sub arg3, 8
        add arg2, 8
.less_than_8_left:

        cmp arg3, 4
        jl .less_than_4_left

        ; load 4 Bytes
        mov eax, [arg2]
        mov [r11], eax
        add r11, 4
        sub arg3, 4
        add arg2, 4
.less_than_4_left:

        cmp arg3, 2
        jl .less_than_2_left

        ; load 2 Bytes
        mov ax, [arg2]
        mov [r11], ax
        add r11, 2
        sub arg3, 2
        add arg2, 2
.less_than_2_left:
        cmp arg3, 1
        jl .zero_left

        ; load 1 Byte
        mov al, [arg2]
        mov [r11], al

.zero_left:
        vmovdqa xmm7, [rsp]
        vpxor xmm7, xmm0 ; xor the initial crc value

        lea rax, [pshufb_shf_table]
        vmovdqu xmm0, [rax + r9]
        vpshufb xmm7, xmm0
        jmp .128_done

align 16
.exact_16_left:
        vmovdqu xmm7, [arg2]
        vpxor xmm7, xmm0 ; xor the initial crc value
        jmp .128_done

.only_less_than_4:
        cmp arg3, 3
        jl .only_less_than_3

        ; load 3 Bytes
        mov al, [arg2]
        mov [r11], al

        mov al, [arg2+1]
        mov [r11+1], al

        mov al, [arg2+2]
        mov [r11+2], al

        vmovdqa xmm7, [rsp]
        vpxor xmm7, xmm0 ; xor the initial crc value

        vpslldq xmm7, 5
        jmp .barrett

.only_less_than_3:
        cmp arg3, 2
        jl .only_less_than_2

        ; load 2 Bytes
        mov al, [arg2]
        mov [r11], al

        mov al, [arg2+1]
        mov [r11+1], al

        vmovdqa xmm7, [rsp]
        vpxor xmm7, xmm0 ; xor the initial crc value

        vpslldq xmm7, 6
        jmp .barrett

.only_less_than_2:
        ; load 1 Byte
        mov al, [arg2]
        mov [r11], al

        vmovdqa xmm7, [rsp]
        vpxor xmm7, xmm0 ; xor the initial crc value

        vpslldq xmm7, 7
        jmp .barrett

section .data

; precomputed constants
align 16
rk1: dq 0x00000000ccaa009e
rk2: dq 0x00000001751997d0
rk3: dq 0x000000014a7fe880
rk4: dq 0x00000001e88ef372
rk5: dq 0x00000000ccaa009e
rk6: dq 0x0000000163cd6124
rk7: dq 0x00000001f7011640
rk8: dq 0x00000001db710640
rk9: dq 0x00000001d7cfc6ac
rk10: dq 0x00000001ea89367e
rk11: dq 0x000000018cb44e58
rk12: dq 0x00000000df068dc2
rk13: dq 0x00000000ae0b5394
rk14: dq 0x00000001c7569e54
rk15: dq 0x00000001c6e41596
rk16: dq 0x0000000154442bd4
rk17: dq 0x0000000174359406
rk18: dq 0x000000003db1ecdc
rk19: dq 0x000000015a546366
rk20: dq 0x00000000f1da05aa

mask: dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
mask2: dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
mask3: dq 0x8080808080808080, 0x8080808080808080

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908