;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;       Function API:
;       uint64_t crc64_jones_refl_by8(
;               uint64_t init_crc,        // initial CRC value, 64 bits
;               const unsigned char *buf, // buffer pointer to calculate CRC on
;               uint64_t len              // buffer length in bytes (64-bit data)
;       );
;
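;       For reference: the routine computes the same value as a bitwise
;       reflected CRC-64 with inverted initial and final values (the
;       'not' at entry and exit below). A minimal C sketch of the intended
;       semantics; the reflected Jones polynomial constant is an assumption
;       taken from the published CRC-64/Jones parameters, not stated in
;       this file:
;
;       uint64_t crc64_jones_refl_ref(uint64_t init_crc,
;                                     const unsigned char *buf,
;                                     uint64_t len)
;       {
;               uint64_t rem = ~init_crc;
;               while (len--) {
;                       rem ^= *buf++;
;                       for (int i = 0; i < 8; i++)     // LSB-first bit loop
;                               rem = (rem >> 1) ^ ((rem & 1) ? 0x95ac9329ac4bc9b5ULL : 0);
;               }
;               return ~rem;
;       }
;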
%include "reg_sizes.asm"

%define fetch_dist 1024

[bits 64]
default rel

section .text


%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx
%endif

%define TMP 16*0
%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_SAVE 16*2
        %define VARIABLE_OFFSET 16*10+8
%else
        %define VARIABLE_OFFSET 16*2+8
%endif


align 16
global crc64_jones_refl_by8:ISAL_SYM_TYPE_FUNCTION
crc64_jones_refl_by8:
        ; uint64_t c = crc ^ 0xffffffffffffffffL;
        not     arg1
        sub     rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
        ; save the xmm registers on the stack; xmm6-xmm15 are callee-saved on win64
        movdqa  [rsp + XMM_SAVE + 16*0], xmm6
        movdqa  [rsp + XMM_SAVE + 16*1], xmm7
        movdqa  [rsp + XMM_SAVE + 16*2], xmm8
        movdqa  [rsp + XMM_SAVE + 16*3], xmm9
        movdqa  [rsp + XMM_SAVE + 16*4], xmm10
        movdqa  [rsp + XMM_SAVE + 16*5], xmm11
        movdqa  [rsp + XMM_SAVE + 16*6], xmm12
        movdqa  [rsp + XMM_SAVE + 16*7], xmm13
%endif

        ; check if the buffer is smaller than 256B
        cmp     arg3, 256

        ; for sizes less than 256, we can't fold 128B at a time...
        jl      _less_than_256


        ; load the initial crc value
        movq    xmm10, arg1     ; initial crc
        ; load the initial 128B of data and xor in the initial crc value
        movdqu  xmm0, [arg2+16*0]
        movdqu  xmm1, [arg2+16*1]
        movdqu  xmm2, [arg2+16*2]
        movdqu  xmm3, [arg2+16*3]
        movdqu  xmm4, [arg2+16*4]
        movdqu  xmm5, [arg2+16*5]
        movdqu  xmm6, [arg2+16*6]
        movdqu  xmm7, [arg2+16*7]

        ; XOR the initial_crc value
        pxor    xmm0, xmm10
        movdqa  xmm10, [rk3]    ; xmm10 has rk3 and rk4
                                ; the imm8 of each pclmulqdq selects which constant to use
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; we subtract 256 instead of 128 to save one instruction from the loop
        sub     arg3, 256

        ; at this point there are 128*x+y (0 <= y < 128) bytes of buffer left.
        ; the _fold_128_B_loop folds 128B at a time until 128+y bytes remain
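
        ; One lane of the 128-byte fold, sketched in pseudo-C (clmul() is
        ; a 64x64->128 carry-less multiply, i.e. one PCLMULQDQ; the imm8
        ; values are the ones used in the loop below):
        ;
        ;       // x    = one folded 128-bit lane (xmm0..xmm7)
        ;       // data = the matching 16B of input 128 bytes ahead
        ;       x = clmul(low64(x), rk4)        // imm8 = 0x10
        ;         ^ clmul(high64(x), rk3)       // imm8 = 0x01
        ;         ^ data;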


; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
_fold_128_B_loop:

        ; update the buffer pointer
        add     arg2, 128

        prefetchnta [arg2+fetch_dist+0]
        movdqu  xmm9, [arg2+16*0]
        movdqu  xmm12, [arg2+16*1]
        movdqa  xmm8, xmm0
        movdqa  xmm13, xmm1
        pclmulqdq       xmm0, xmm10, 0x10
        pclmulqdq       xmm8, xmm10, 0x1
        pclmulqdq       xmm1, xmm10, 0x10
        pclmulqdq       xmm13, xmm10, 0x1
        pxor    xmm0, xmm9
        xorps   xmm0, xmm8
        pxor    xmm1, xmm12
        xorps   xmm1, xmm13

        prefetchnta [arg2+fetch_dist+32]
        movdqu  xmm9, [arg2+16*2]
        movdqu  xmm12, [arg2+16*3]
        movdqa  xmm8, xmm2
        movdqa  xmm13, xmm3
        pclmulqdq       xmm2, xmm10, 0x10
        pclmulqdq       xmm8, xmm10, 0x1
        pclmulqdq       xmm3, xmm10, 0x10
        pclmulqdq       xmm13, xmm10, 0x1
        pxor    xmm2, xmm9
        xorps   xmm2, xmm8
        pxor    xmm3, xmm12
        xorps   xmm3, xmm13

        prefetchnta [arg2+fetch_dist+64]
        movdqu  xmm9, [arg2+16*4]
        movdqu  xmm12, [arg2+16*5]
        movdqa  xmm8, xmm4
        movdqa  xmm13, xmm5
        pclmulqdq       xmm4, xmm10, 0x10
        pclmulqdq       xmm8, xmm10, 0x1
        pclmulqdq       xmm5, xmm10, 0x10
        pclmulqdq       xmm13, xmm10, 0x1
        pxor    xmm4, xmm9
        xorps   xmm4, xmm8
        pxor    xmm5, xmm12
        xorps   xmm5, xmm13

        prefetchnta [arg2+fetch_dist+96]
        movdqu  xmm9, [arg2+16*6]
        movdqu  xmm12, [arg2+16*7]
        movdqa  xmm8, xmm6
        movdqa  xmm13, xmm7
        pclmulqdq       xmm6, xmm10, 0x10
        pclmulqdq       xmm8, xmm10, 0x1
        pclmulqdq       xmm7, xmm10, 0x10
        pclmulqdq       xmm13, xmm10, 0x1
        pxor    xmm6, xmm9
        xorps   xmm6, xmm8
        pxor    xmm7, xmm12
        xorps   xmm7, xmm13

        sub     arg3, 128

        ; check if there is another 128B in the buffer to be able to fold
        jge     _fold_128_B_loop
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        add     arg2, 128
        ; at this point, the buffer pointer is pointing at the last y bytes of the buffer, where 0 <= y < 128
        ; the 128B of folded data is in 8 of the xmm registers: xmm0 through xmm7


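        ; Each of xmm0..xmm6 sits a different distance ahead of xmm7 (112B
        ; down to 16B), so each fold below uses its own constant pair:
        ; rk9/rk10 for xmm0 (112B), rk11/rk12 for xmm1 (96B), and so on
        ; down to rk19/rk20 for xmm5 (32B) and rk1/rk2 for xmm6 (16B).
        ; Every step computes, in pseudo-C:
        ;
        ;       xmm7 ^= clmul(high64(x_i), rk_odd) ^ clmul(low64(x_i), rk_even);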
        ; fold the 8 xmm registers to 1 xmm register with different constants
        ; xmm0 to xmm7
        movdqa  xmm10, [rk9]
        movdqa  xmm8, xmm0
        pclmulqdq       xmm0, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        xorps   xmm7, xmm0
        ; xmm1 to xmm7
        movdqa  xmm10, [rk11]
        movdqa  xmm8, xmm1
        pclmulqdq       xmm1, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        xorps   xmm7, xmm1

        movdqa  xmm10, [rk13]
        movdqa  xmm8, xmm2
        pclmulqdq       xmm2, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        pxor    xmm7, xmm2

        movdqa  xmm10, [rk15]
        movdqa  xmm8, xmm3
        pclmulqdq       xmm3, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        xorps   xmm7, xmm3

        movdqa  xmm10, [rk17]
        movdqa  xmm8, xmm4
        pclmulqdq       xmm4, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        pxor    xmm7, xmm4

        movdqa  xmm10, [rk19]
        movdqa  xmm8, xmm5
        pclmulqdq       xmm5, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        xorps   xmm7, xmm5
        ; xmm6 to xmm7
        movdqa  xmm10, [rk1]
        movdqa  xmm8, xmm6
        pclmulqdq       xmm6, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        pxor    xmm7, xmm6


        ; instead of 128, we add 128-16 to the loop counter to save one instruction from the loop
        ; instead of a cmp instruction, we use the sign flag with the jl instruction
        add     arg3, 128-16
        jl      _final_reduction_for_128

        ; now we have 16+y bytes left to reduce. 16 bytes are in register xmm7 and the rest is in memory
        ; we can fold 16 bytes at a time if y >= 16
        ; continue folding 16B at a time

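        ; One iteration of the 16B fold, in pseudo-C (xmm10 holds the
        ; rk1/rk2 pair on every path that reaches this loop):
        ;
        ;       xmm7 = clmul(high64(xmm7), rk1)         // imm8 = 0x01
        ;            ^ clmul(low64(xmm7), rk2)          // imm8 = 0x10
        ;            ^ load16(arg2);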
_16B_reduction_loop:
        movdqa  xmm8, xmm7
        pclmulqdq       xmm7, xmm10, 0x1
        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        movdqu  xmm0, [arg2]
        pxor    xmm7, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the jge instruction
        ; equivalent of: cmp arg3, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge     _16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm7 register


_final_reduction_for_128:
        add     arg3, 16
        je      _128_done
        ; here we are left with less than 16 bytes of data.
        ; since we know there was data before the current pointer, we can back the
        ; input pointer up to load exactly 16 bytes, then adjust the registers.
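        ; Sketch of the merge done below, in pseudo-C (shifts are byte-wise,
        ; implemented with pshufb and the shift table):
        ;
        ;       xmm1 = load16(arg2 - 16 + arg3);     // last 16B, overlaps folded data
        ;       xmm2 = old_xmm7 >> arg3;             // stale bytes dropped
        ;       xmm2 = blend(xmm2, xmm1);            // top arg3 bytes from the new tail
        ;       xmm7 = old_xmm7 << (16 - arg3);      // part still to be folded
        ;       xmm7 = fold16(xmm7) ^ xmm2;          // one more rk1/rk2 fold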
_get_last_two_xmms:


        movdqa  xmm2, xmm7
        movdqu  xmm1, [arg2 - 16 + arg3]

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table]
        add     rax, arg3
        movdqu  xmm0, [rax]


        pshufb  xmm7, xmm0
        pxor    xmm0, [mask3]
        pshufb  xmm2, xmm0

        pblendvb        xmm2, xmm1      ; xmm0 is implicit
        ;;;;;;;;;;
        movdqa  xmm8, xmm7
        pclmulqdq       xmm7, xmm10, 0x1

        pclmulqdq       xmm8, xmm10, 0x10
        pxor    xmm7, xmm8
        pxor    xmm7, xmm2

_128_done:
        ; compute crc of a 128-bit value
        movdqa  xmm10, [rk5]
        movdqa  xmm0, xmm7

        ; 64b fold
        pclmulqdq       xmm7, xmm10, 0
        psrldq  xmm0, 8
        pxor    xmm7, xmm0

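        ; Barrett reduction in the reflected domain, in pseudo-C
        ; (rk7 = floor(2^128/Q) and rk8 = Q, per the .data section below):
        ;
        ;       // R = the 128-bit value left in xmm7
        ;       T1  = clmul(low64(R), rk7);
        ;       T2  = clmul(low64(T1), rk8) ^ (T1 << 64) ^ R;
        ;       crc = high64(T2);               // extracted with pextrq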
        ; barrett reduction
_barrett:
        movdqa  xmm1, xmm7
        movdqa  xmm10, [rk7]

        pclmulqdq       xmm7, xmm10, 0
        movdqa  xmm2, xmm7
        pclmulqdq       xmm7, xmm10, 0x10
        pslldq  xmm2, 8
        pxor    xmm7, xmm2
        pxor    xmm7, xmm1
        pextrq  rax, xmm7, 1

_cleanup:
        ; return c ^ 0xffffffffffffffffL;
        not     rax


%ifidn __OUTPUT_FORMAT__, win64
        movdqa  xmm6, [rsp + XMM_SAVE + 16*0]
        movdqa  xmm7, [rsp + XMM_SAVE + 16*1]
        movdqa  xmm8, [rsp + XMM_SAVE + 16*2]
        movdqa  xmm9, [rsp + XMM_SAVE + 16*3]
        movdqa  xmm10, [rsp + XMM_SAVE + 16*4]
        movdqa  xmm11, [rsp + XMM_SAVE + 16*5]
        movdqa  xmm12, [rsp + XMM_SAVE + 16*6]
        movdqa  xmm13, [rsp + XMM_SAVE + 16*7]
%endif
        add     rsp, VARIABLE_OFFSET
        ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_256:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      _less_than_32

        ; if there is, load the constants
        movdqa  xmm10, [rk1]    ; rk1 and rk2 in xmm10

        movq    xmm0, arg1      ; get the initial crc value
        movdqu  xmm7, [arg2]    ; load the plaintext
        pxor    xmm7, xmm0

        ; update the buffer pointer
        add     arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        sub     arg3, 32

        jmp     _16B_reduction_loop

align 16
_less_than_32:
        ; move the initial crc to the return value; this is necessary for zero-length buffers.
        mov     rax, arg1
        test    arg3, arg3
        je      _cleanup

        movq    xmm0, arg1      ; get the initial crc value

        cmp     arg3, 16
        je      _exact_16_left
        jl      _less_than_16_left

        movdqu  xmm7, [arg2]    ; load the plaintext
        pxor    xmm7, xmm0      ; xor the initial crc value
        add     arg2, 16
        sub     arg3, 16
        movdqa  xmm10, [rk1]    ; rk1 and rk2 in xmm10
        jmp     _get_last_two_xmms


align 16
_less_than_16_left:
        ; use stack space to load data less than 16 bytes; zero out the 16B in memory first.
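        ; in effect, this tail path does (pseudo-C sketch):
        ;
        ;       uint8_t tmp[16] = {0};          // zeroed 16B stack slot
        ;       memcpy(tmp, arg2, arg3);        // arg3 < 16, copied in 8/4/2/1-byte chunks
        ;       xmm7 = load16(tmp) ^ init_crc;  // then shifted into place via pshufb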

        pxor    xmm1, xmm1
        mov     r11, rsp
        movdqa  [r11], xmm1

        ; backup the counter value
        mov     r9, arg3
        cmp     arg3, 8
        jl      _less_than_8_left

        ; load 8 Bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
_less_than_8_left:

        cmp     arg3, 4
        jl      _less_than_4_left

        ; load 4 Bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
_less_than_4_left:

        cmp     arg3, 2
        jl      _less_than_2_left

        ; load 2 Bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
_less_than_2_left:
        cmp     arg3, 1
        jl      _zero_left

        ; load 1 Byte
        mov     al, [arg2]
        mov     [r11], al

_zero_left:
        movdqa  xmm7, [rsp]
        pxor    xmm7, xmm0      ; xor the initial crc value

        lea     rax, [pshufb_shf_table]

        cmp     r9, 8
        jl      _end_1to7

_end_8to15:
        movdqu  xmm0, [rax + r9]
        pshufb  xmm7, xmm0
        jmp     _128_done

_end_1to7:
        ; left shift (8-length) bytes in XMM
        movdqu  xmm0, [rax + r9 + 8]
        pshufb  xmm7, xmm0

        jmp     _barrett

align 16
_exact_16_left:
        movdqu  xmm7, [arg2]
        pxor    xmm7, xmm0      ; xor the initial crc value

        jmp     _128_done

section .data

; precomputed constants
align 16
; rk7 = floor(2^128/Q)
; rk8 = Q
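; The remaining rk constants are fold multipliers of the form x^N mod Q in
; the bit-reflected domain, per the pclmulqdq folding method: rk1/rk2 fold
; by 16 bytes, rk3/rk4 by 128 bytes, rk5 folds 128 bits down to 64, and
; rk9..rk20 cover the 112B..32B distances used when collapsing the eight
; lanes. The exact exponents are not stated in this file; this summary is
; inferred from how each pair is used above.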
rk1:
DQ 0x381d0015c96f4444
rk2:
DQ 0xd9d7be7d505da32c
rk3:
DQ 0x768361524d29ed0b
rk4:
DQ 0xcc26fa7c57f8054c
rk5:
DQ 0x381d0015c96f4444
rk6:
DQ 0x0000000000000000
rk7:
DQ 0x3e6cfa329aef9f77
rk8:
DQ 0x2b5926535897936a
rk9:
DQ 0x5bc94ba8e2087636
rk10:
DQ 0x6cf09c8f37710b75
rk11:
DQ 0x3885fd59e440d95a
rk12:
DQ 0xbccba3936411fb7e
rk13:
DQ 0xe4dd0d81cbfce585
rk14:
DQ 0xb715e37b96ed8633
rk15:
DQ 0xf49784a634f014e4
rk16:
DQ 0xaf86efb16d9ab4fb
rk17:
DQ 0x7b3211a760160db8
rk18:
DQ 0xa062b2319d66692f
rk19:
DQ 0xef3d1d18ed889ed2
rk20:
DQ 0x6ba4d760ab38201e

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908


mask:
dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000
mask2:
dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF
mask3:
dq 0x8080808080808080, 0x8080808080808080

;;; func        core, ver, snum
slversion crc64_jones_refl_by8, 01, 00, 0029