;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2016 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

; Function API:
;     uint64_t crc64_ecma_norm_by8(
;             uint64_t init_crc,        //initial CRC value, 64 bits
;             const unsigned char *buf, //buffer pointer to calculate CRC on
;             uint64_t len              //buffer length in bytes (64-bit data)
;     );
;
; yasm -f elf64 -X gnu -g dwarf2 crc64_ecma_norm_by8.asm
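;
; A minimal calling sketch (illustrative only, not part of the original
; source; assumes the System V AMD64 ABI on Linux, and the label `msg`,
; the length `msg_len`, and the seed value are hypothetical):
;
;     mov     rdi, 0                   ; init_crc (seed)
;     lea     rsi, [msg]               ; buf
;     mov     rdx, msg_len             ; len in bytes
;     call    crc64_ecma_norm_by8
;     ; the 64-bit CRC is returned in rax
;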
%include "reg_sizes.asm"

%define fetch_dist 1024

[bits 64]
default rel

section .text

%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx
%endif

%define TMP 16*0
%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_SAVE 16*2
        %define VARIABLE_OFFSET 16*10+8
%else
        %define VARIABLE_OFFSET 16*2+8
%endif
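
; Note on VARIABLE_OFFSET (an added explanation, not in the original source):
; at function entry rsp is 8 mod 16, since the call pushed a return address
; onto an ABI-aligned stack, so subtracting 16*n+8 restores 16-byte alignment
; for the aligned movdqa saves/restores below; on win64 the extra 16*8 bytes
; hold the callee-saved registers xmm6-xmm13.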
align 16
mk_global crc64_ecma_norm_by8, function
crc64_ecma_norm_by8:
        endbranch

        not     arg1                    ; ~init_crc

        sub     rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
        ; save the xmm registers on the stack (callee-saved on win64)
        movdqa  [rsp + XMM_SAVE + 16*0], xmm6
        movdqa  [rsp + XMM_SAVE + 16*1], xmm7
        movdqa  [rsp + XMM_SAVE + 16*2], xmm8
        movdqa  [rsp + XMM_SAVE + 16*3], xmm9
        movdqa  [rsp + XMM_SAVE + 16*4], xmm10
        movdqa  [rsp + XMM_SAVE + 16*5], xmm11
        movdqa  [rsp + XMM_SAVE + 16*6], xmm12
        movdqa  [rsp + XMM_SAVE + 16*7], xmm13
%endif


        ; check if smaller than 256
        cmp     arg3, 256

        ; for sizes less than 256, we can't fold 128B at a time...
        jl      _less_than_256


        ; load the initial crc value
        movq    xmm10, arg1             ; initial crc

        ; the crc value does not need to be byte-reflected, but it does need to
        ; be moved to the high part of the register, because the data will be
        ; byte-reflected and will then align with the initial crc in the right place
        pslldq  xmm10, 8

        movdqa  xmm11, [SHUF_MASK]
        ; receive the initial 128B data, xor the initial crc value
        movdqu  xmm0, [arg2+16*0]
        movdqu  xmm1, [arg2+16*1]
        movdqu  xmm2, [arg2+16*2]
        movdqu  xmm3, [arg2+16*3]
        movdqu  xmm4, [arg2+16*4]
        movdqu  xmm5, [arg2+16*5]
        movdqu  xmm6, [arg2+16*6]
        movdqu  xmm7, [arg2+16*7]

        pshufb  xmm0, xmm11
        ; XOR the initial_crc value
        pxor    xmm0, xmm10
        pshufb  xmm1, xmm11
        pshufb  xmm2, xmm11
        pshufb  xmm3, xmm11
        pshufb  xmm4, xmm11
        pshufb  xmm5, xmm11
        pshufb  xmm6, xmm11
        pshufb  xmm7, xmm11

        movdqa  xmm10, [rk3]            ; xmm10 has rk3 and rk4
                                        ; the imm value of the pclmulqdq instruction
                                        ; will determine which constant to use
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; we subtract 256 instead of 128 to save one instruction from the loop
        sub     arg3, 256

        ; at this point there are 128*x+y (0<=y<128) bytes of buffer left.
        ; _fold_128_B_loop folds 128B at a time until 128+y bytes of buffer remain


        ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
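        ;
        ; Per lane, one iteration applies the standard pclmulqdq folding identity
        ; (a sketch of the math in GF(2) polynomial arithmetic, not new code):
        ;     acc' = (acc_low * rk3) xor (acc_high * rk4) xor new_data
        ; imm 0x00 multiplies the two low qwords (acc_low * rk3) and imm 0x11 the
        ; two high qwords (acc_high * rk4); rk3/rk4 are presumably constants of
        ; the form x^n mod P(x) matching the 128-byte fold distance.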
_fold_128_B_loop:

        ; update the buffer pointer
        add     arg2, 128               ; buf += 128;

        prefetchnta [arg2+fetch_dist+0]
        movdqu  xmm9, [arg2+16*0]
        movdqu  xmm12, [arg2+16*1]
        pshufb  xmm9, xmm11
        pshufb  xmm12, xmm11
        movdqa  xmm8, xmm0
        movdqa  xmm13, xmm1
        pclmulqdq xmm0, xmm10, 0x0
        pclmulqdq xmm8, xmm10, 0x11
        pclmulqdq xmm1, xmm10, 0x0
        pclmulqdq xmm13, xmm10, 0x11
        pxor    xmm0, xmm9
        xorps   xmm0, xmm8
        pxor    xmm1, xmm12
        xorps   xmm1, xmm13

        prefetchnta [arg2+fetch_dist+32]
        movdqu  xmm9, [arg2+16*2]
        movdqu  xmm12, [arg2+16*3]
        pshufb  xmm9, xmm11
        pshufb  xmm12, xmm11
        movdqa  xmm8, xmm2
        movdqa  xmm13, xmm3
        pclmulqdq xmm2, xmm10, 0x0
        pclmulqdq xmm8, xmm10, 0x11
        pclmulqdq xmm3, xmm10, 0x0
        pclmulqdq xmm13, xmm10, 0x11
        pxor    xmm2, xmm9
        xorps   xmm2, xmm8
        pxor    xmm3, xmm12
        xorps   xmm3, xmm13

        prefetchnta [arg2+fetch_dist+64]
        movdqu  xmm9, [arg2+16*4]
        movdqu  xmm12, [arg2+16*5]
        pshufb  xmm9, xmm11
        pshufb  xmm12, xmm11
        movdqa  xmm8, xmm4
        movdqa  xmm13, xmm5
        pclmulqdq xmm4, xmm10, 0x0
        pclmulqdq xmm8, xmm10, 0x11
        pclmulqdq xmm5, xmm10, 0x0
        pclmulqdq xmm13, xmm10, 0x11
        pxor    xmm4, xmm9
        xorps   xmm4, xmm8
        pxor    xmm5, xmm12
        xorps   xmm5, xmm13

        prefetchnta [arg2+fetch_dist+96]
        movdqu  xmm9, [arg2+16*6]
        movdqu  xmm12, [arg2+16*7]
        pshufb  xmm9, xmm11
        pshufb  xmm12, xmm11
        movdqa  xmm8, xmm6
        movdqa  xmm13, xmm7
        pclmulqdq xmm6, xmm10, 0x0
        pclmulqdq xmm8, xmm10, 0x11
        pclmulqdq xmm7, xmm10, 0x0
        pclmulqdq xmm13, xmm10, 0x11
        pxor    xmm6, xmm9
        xorps   xmm6, xmm8
        pxor    xmm7, xmm12
        xorps   xmm7, xmm13

        sub     arg3, 128

        ; check if there is another 128B in the buffer to be able to fold
        jge     _fold_128_B_loop
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        add     arg2, 128
        ; at this point, the buffer pointer is pointing at the last y bytes of the buffer, where 0 <= y < 128
        ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7


        ; fold the 8 xmm registers to 1 xmm register with different constants

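        ; (added note) each of xmm0..xmm6 sits at a different byte distance from
        ; the end of the message, so each fold below uses its own constant pair
        ; (rk9/rk10 through rk19/rk20, then rk1/rk2 for the 16B steps); the low
        ; qword of each pair is selected via imm 0x0 and the high qword via imm 0x11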
        movdqa  xmm10, [rk9]
        movdqa  xmm8, xmm0
        pclmulqdq xmm0, xmm10, 0x11
        pclmulqdq xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        xorps   xmm7, xmm0

        movdqa  xmm10, [rk11]
        movdqa  xmm8, xmm1
        pclmulqdq xmm1, xmm10, 0x11
        pclmulqdq xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        xorps   xmm7, xmm1

        movdqa  xmm10, [rk13]
        movdqa  xmm8, xmm2
        pclmulqdq xmm2, xmm10, 0x11
        pclmulqdq xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        pxor    xmm7, xmm2

        movdqa  xmm10, [rk15]
        movdqa  xmm8, xmm3
        pclmulqdq xmm3, xmm10, 0x11
        pclmulqdq xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        xorps   xmm7, xmm3

        movdqa  xmm10, [rk17]
        movdqa  xmm8, xmm4
        pclmulqdq xmm4, xmm10, 0x11
        pclmulqdq xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        pxor    xmm7, xmm4

        movdqa  xmm10, [rk19]
        movdqa  xmm8, xmm5
        pclmulqdq xmm5, xmm10, 0x11
        pclmulqdq xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        xorps   xmm7, xmm5

        movdqa  xmm10, [rk1]            ; xmm10 has rk1 and rk2

        movdqa  xmm8, xmm6
        pclmulqdq xmm6, xmm10, 0x11
        pclmulqdq xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        pxor    xmm7, xmm6


        ; instead of 128, we add 112 (128-16) to the loop counter to save one instruction from the loop
        ; instead of a cmp instruction, we use the negative flag with the jl instruction
        add     arg3, 128-16
        jl      _final_reduction_for_128

        ; now we have 16+y bytes left to reduce. 16 bytes are in register xmm7 and the rest is in memory
        ; we can fold 16 bytes at a time if y>=16
        ; continue folding 16B at a time

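        ; (added note) each iteration below folds xmm7 forward over the next 16B
        ; of input; sketching the same identity as above, with the rk1/rk2 pair:
        ;     xmm7' = (xmm7_low * rk1) xor (xmm7_high * rk2) xor next_16B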
_16B_reduction_loop:
        movdqa  xmm8, xmm7
        pclmulqdq xmm7, xmm10, 0x11
        pclmulqdq xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        movdqu  xmm0, [arg2]
        pshufb  xmm0, xmm11
        pxor    xmm7, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the jge instruction
        ; equivalent of: cmp arg3, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge     _16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm7 register


_final_reduction_for_128:
        ; check if any more data to fold. If not, compute the CRC of the final 128 bits
        add     arg3, 16
        je      _128_done

        ; here we have fewer than 16 bytes of data left.
        ; since we know there was data before the current pointer, we can back the
        ; input pointer up so that it loads exactly 16 bytes, and then adjust the
        ; registers to discard the extra data.
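        ; (added note) the shuffle masks come from pshufb_shf_table in the data
        ; section: indexing at [pshufb_shf_table + 16 - len] yields a left-shift
        ; mask, and xoring it with mask1 (all 0x80 bytes) flips it into the
        ; complementary right-shift mask, since pshufb zeroes any lane whose
        ; mask byte has the top bit set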
_get_last_two_xmms:
        movdqa  xmm2, xmm7

        movdqu  xmm1, [arg2 - 16 + arg3]
        pshufb  xmm1, xmm11

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, arg3
        movdqu  xmm0, [rax]

        ; shift xmm2 to the left by arg3 bytes
        pshufb  xmm2, xmm0

        ; shift xmm7 to the right by 16-arg3 bytes
        pxor    xmm0, [mask1]
        pshufb  xmm7, xmm0
        pblendvb xmm1, xmm2             ; xmm0 is implicit

        ; fold 16 bytes
        movdqa  xmm2, xmm1
        movdqa  xmm8, xmm7
        pclmulqdq xmm7, xmm10, 0x11
        pclmulqdq xmm8, xmm10, 0x0
        pxor    xmm7, xmm8
        pxor    xmm7, xmm2

_128_done:
        ; compute crc of a 128-bit value
        movdqa  xmm10, [rk5]            ; rk5 and rk6 in xmm10
        movdqa  xmm0, xmm7

        ; 64b fold
        pclmulqdq xmm7, xmm10, 0x01     ; H*L
        pslldq  xmm0, 8
        pxor    xmm7, xmm0

        ; barrett reduction
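        ; (added note, inferred from the constant values rather than stated in
        ; the source) the 64b fold above multiplied the high half by rk5 and
        ; xored in the low half shifted up; for the step below, rk8 =
        ; 0x42f0e1eba9ea3693 is the ECMA-182 polynomial P(x) with its x^64 term
        ; dropped, and rk7 is presumably the Barrett constant floor(x^128 / P(x)).
        ; The classic pclmulqdq Barrett step estimates the quotient by
        ; multiplying the high half of the remainder by rk7, multiplies that
        ; quotient by P(x), and xors with the original value so only the 64-bit
        ; remainder survives in the low qword.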
_barrett:
        movdqa  xmm10, [rk7]            ; rk7 and rk8 in xmm10
        movdqa  xmm0, xmm7

        movdqa  xmm1, xmm7
        pand    xmm1, [mask3]
        pclmulqdq xmm7, xmm10, 0x01
        pxor    xmm7, xmm1

        pclmulqdq xmm7, xmm10, 0x11
        pxor    xmm7, xmm0
        pextrq  rax, xmm7, 0

_cleanup:
        not     rax
%ifidn __OUTPUT_FORMAT__, win64
        movdqa  xmm6, [rsp + XMM_SAVE + 16*0]
        movdqa  xmm7, [rsp + XMM_SAVE + 16*1]
        movdqa  xmm8, [rsp + XMM_SAVE + 16*2]
        movdqa  xmm9, [rsp + XMM_SAVE + 16*3]
        movdqa  xmm10, [rsp + XMM_SAVE + 16*4]
        movdqa  xmm11, [rsp + XMM_SAVE + 16*5]
        movdqa  xmm12, [rsp + XMM_SAVE + 16*6]
        movdqa  xmm13, [rsp + XMM_SAVE + 16*7]
%endif
        add     rsp, VARIABLE_OFFSET
        ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_256:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      _less_than_32
        movdqa  xmm11, [SHUF_MASK]

        ; if there is, load the constants
        movdqa  xmm10, [rk1]            ; rk1 and rk2 in xmm10

        movq    xmm0, arg1              ; get the initial crc value
        pslldq  xmm0, 8                 ; align it to its correct place
        movdqu  xmm7, [arg2]            ; load the plaintext
        pshufb  xmm7, xmm11             ; byte-reflect the plaintext
        pxor    xmm7, xmm0


        ; update the buffer pointer
        add     arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        sub     arg3, 32

        jmp     _16B_reduction_loop
align 16
_less_than_32:
        ; mov initial crc to the return value. this is necessary for zero-length buffers.
        mov     rax, arg1
        test    arg3, arg3
        je      _cleanup

        movdqa  xmm11, [SHUF_MASK]

        movq    xmm0, arg1              ; get the initial crc value
        pslldq  xmm0, 8                 ; align it to its correct place

        cmp     arg3, 16
        je      _exact_16_left
        jl      _less_than_16_left

        movdqu  xmm7, [arg2]            ; load the plaintext
        pshufb  xmm7, xmm11             ; byte-reflect the plaintext
        pxor    xmm7, xmm0              ; xor the initial crc value
        add     arg2, 16
        sub     arg3, 16
        movdqa  xmm10, [rk1]            ; rk1 and rk2 in xmm10
        jmp     _get_last_two_xmms
align 16
_less_than_16_left:
        ; use stack space to load data less than 16 bytes; zero out the 16B of stack memory first
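        ; (added note) the cascade below stages an arbitrary 1-15 byte tail into
        ; that zeroed 16B slot by copying the largest power-of-two pieces first
        ; (8, then 4, then 2, then 1 bytes), so one aligned 16B load can pick it up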
        pxor    xmm1, xmm1
        mov     r11, rsp
        movdqa  [r11], xmm1

        ; backup the counter value
        mov     r9, arg3
        cmp     arg3, 8
        jl      _less_than_8_left

        ; load 8 Bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
_less_than_8_left:

        cmp     arg3, 4
        jl      _less_than_4_left

        ; load 4 Bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
_less_than_4_left:

        cmp     arg3, 2
        jl      _less_than_2_left

        ; load 2 Bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
_less_than_2_left:
        cmp     arg3, 1
        jl      _zero_left

        ; load 1 Byte
        mov     al, [arg2]
        mov     [r11], al
_zero_left:
        movdqa  xmm7, [rsp]
        pshufb  xmm7, xmm11
        pxor    xmm7, xmm0              ; xor the initial crc value

        ; shl r9, 4
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, r9

        cmp     r9, 8
        jl      _end_1to7

_end_8to15:
        movdqu  xmm0, [rax]
        pxor    xmm0, [mask1]

        pshufb  xmm7, xmm0
        jmp     _128_done

_end_1to7:
        ; Right shift (8-length) bytes in XMM
        add     rax, 8
        movdqu  xmm0, [rax]
        pshufb  xmm7, xmm0

        jmp     _barrett
align 16
_exact_16_left:
        movdqu  xmm7, [arg2]
        pshufb  xmm7, xmm11
        pxor    xmm7, xmm0              ; xor the initial crc value

        jmp     _128_done

section .data

; precomputed constants
align 16

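; (added note, an inference from how the code uses them rather than stated in
; the source) the rk values below are presumably precomputed fold/reduction
; constants of the form x^n mod P(x) for the ECMA-182 polynomial: rk3/rk4
; drive the 128B folds, rk9-rk20 the 8-to-1 register folds, rk1/rk2 the 16B
; folds, and rk5-rk8 the final 64b fold and Barrett reduction.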
rk1:
DQ 0x05f5c3c7eb52fab6
rk2:
DQ 0x4eb938a7d257740e
rk3:
DQ 0x05cf79dea9ac37d6
rk4:
DQ 0x001067e571d7d5c2
rk5:
DQ 0x05f5c3c7eb52fab6
rk6:
DQ 0x0000000000000000
rk7:
DQ 0x578d29d06cc4f872
rk8:
DQ 0x42f0e1eba9ea3693
rk9:
DQ 0xe464f4df5fb60ac1
rk10:
DQ 0xb649c5b35a759cf2
rk11:
DQ 0x9af04e1eff82d0dd
rk12:
DQ 0x6e82e609297f8fe8
rk13:
DQ 0x097c516e98bd2e73
rk14:
DQ 0x0b76477b31e22e7b
rk15:
DQ 0x5f6843ca540df020
rk16:
DQ 0xddf4b6981205b83f
rk17:
DQ 0x54819d8713758b2c
rk18:
DQ 0x4a6b90073eb0af5a
rk19:
DQ 0x571bee0a227ef92b
rk20:
DQ 0x44bef2a201b5200c


mask1:
dq 0x8080808080808080, 0x8080808080808080
mask2:
dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
mask3:
dq 0x0000000000000000, 0xFFFFFFFFFFFFFFFF

SHUF_MASK:
dq 0x08090A0B0C0D0E0F, 0x0001020304050607

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x0f0e0d0c0b0a0908
dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
dq 0x8080808080808080, 0x8080808080808080
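; (added note) the code indexes this table as [pshufb_shf_table + 16 - len]
; (plus 8 more in _end_1to7), so the selected 16B window straddles the rows
; above: bytes with the top bit set are zeroed by pshufb while the rest move
; source bytes, producing the shl/shr masks enumerated in the comment table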

;;;       func                 core, ver, snum
slversion crc64_ecma_norm_by8, 01,   00,  001a