;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2019 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;       Function API:
;       uint64_t crc64_iso_norm_by16_10(
;               uint64_t init_crc,        //initial CRC value, 64 bits
;               const unsigned char *buf, //buffer pointer to calculate CRC on
;               uint64_t len              //buffer length in bytes (64-bit data)
;       );
;
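;       Method (summary of the code below): the buffer is folded with
;       carry-less multiplications (VPCLMULQDQ), 256 bytes, then 128 bytes,
;       then 16 bytes at a time, and the final 128-bit remainder is reduced
;       to the 64-bit CRC with a Barrett reduction. The CRC64-ISO polynomial
;       in normal (non-reflected) form is x^64 + x^4 + x^3 + x + 1
;       (low 64 bits 0x000000000000001B).
;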
%include "reg_sizes.asm"

%ifndef FUNCTION_NAME
%define FUNCTION_NAME crc64_iso_norm_by16_10
%endif

%if (AS_FEATURE_LEVEL) >= 10

%define fetch_dist 1024

[bits 64]
default rel

section .text


%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx
%endif

%ifidn __OUTPUT_FORMAT__, win64
        %define XMM_SAVE 16*2
        %define VARIABLE_OFFSET 16*12+8
%else
        %define VARIABLE_OFFSET 16*2+8
%endif

align 16
global FUNCTION_NAME:ISAL_SYM_TYPE_FUNCTION
FUNCTION_NAME:
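        ; the incoming CRC in arg1 is bit-inverted here and the result is
        ; inverted again at _cleanup, i.e. the CRC is kept complemented
        ; while it is being computed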
        not     arg1
        sub     rsp, VARIABLE_OFFSET

%ifidn __OUTPUT_FORMAT__, win64
        ; save the xmm registers on the stack (xmm6-xmm15 are non-volatile in the win64 ABI)
        vmovdqa [rsp + XMM_SAVE + 16*0], xmm6
        vmovdqa [rsp + XMM_SAVE + 16*1], xmm7
        vmovdqa [rsp + XMM_SAVE + 16*2], xmm8
        vmovdqa [rsp + XMM_SAVE + 16*3], xmm9
        vmovdqa [rsp + XMM_SAVE + 16*4], xmm10
        vmovdqa [rsp + XMM_SAVE + 16*5], xmm11
        vmovdqa [rsp + XMM_SAVE + 16*6], xmm12
        vmovdqa [rsp + XMM_SAVE + 16*7], xmm13
        vmovdqa [rsp + XMM_SAVE + 16*8], xmm14
        vmovdqa [rsp + XMM_SAVE + 16*9], xmm15
%endif
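        ; zmm18 = byte-reflection shuffle mask, broadcast to all four
        ; 128-bit lanes (reverses the byte order within each 16-byte lane)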
        vbroadcasti32x4 zmm18, [SHUF_MASK]
        cmp     arg3, 256
        jl      _less_than_256

        ; load the initial crc value
        vmovq   xmm10, arg1      ; initial crc

        ; the crc value does not need to be byte-reflected, but it needs to be
        ; moved to the high part of the register, because the data will be
        ; byte-reflected and will then align with the initial crc in the right place
        vpslldq xmm10, 8

        ; receive the initial 128B data, xor the initial crc value
        vmovdqu8 zmm0, [arg2+16*0]
        vmovdqu8 zmm4, [arg2+16*4]
        vpshufb zmm0, zmm0, zmm18
        vpshufb zmm4, zmm4, zmm18
        vpxorq  zmm0, zmm10
        vbroadcasti32x4 zmm10, [rk3]    ;zmm10 has rk3 and rk4
                                        ;imm value of pclmulqdq instruction will determine which constant to use
        sub     arg3, 256
        cmp     arg3, 256
        jl      _fold_128_B_loop

        vmovdqu8 zmm7, [arg2+16*8]
        vmovdqu8 zmm8, [arg2+16*12]
        vpshufb zmm7, zmm7, zmm18
        vpshufb zmm8, zmm8, zmm18
        vbroadcasti32x4 zmm16, [rk_1]   ;zmm16 has rk-1 and rk-2
        sub     arg3, 256

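        ; fold 256 bytes per iteration: each 128-bit lane of the four
        ; accumulators (zmm0, zmm4, zmm7, zmm8) is multiplied by the
        ; 256-byte-distance fold constants in zmm16 (rk_1:rk_2) and
        ; xor-ed with the next 256 bytes of byte-reflected input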
_fold_256_B_loop:
        add     arg2, 256
        vmovdqu8 zmm3, [arg2+16*0]
        vpshufb zmm3, zmm3, zmm18
        vpclmulqdq zmm1, zmm0, zmm16, 0x00
        vpclmulqdq zmm2, zmm0, zmm16, 0x11
        vpxorq  zmm0, zmm1, zmm2
        vpxorq  zmm0, zmm0, zmm3

        vmovdqu8 zmm9, [arg2+16*4]
        vpshufb zmm9, zmm9, zmm18
        vpclmulqdq zmm5, zmm4, zmm16, 0x00
        vpclmulqdq zmm6, zmm4, zmm16, 0x11
        vpxorq  zmm4, zmm5, zmm6
        vpxorq  zmm4, zmm4, zmm9

        vmovdqu8 zmm11, [arg2+16*8]
        vpshufb zmm11, zmm11, zmm18
        vpclmulqdq zmm12, zmm7, zmm16, 0x00
        vpclmulqdq zmm13, zmm7, zmm16, 0x11
        vpxorq  zmm7, zmm12, zmm13
        vpxorq  zmm7, zmm7, zmm11

        vmovdqu8 zmm17, [arg2+16*12]
        vpshufb zmm17, zmm17, zmm18
        vpclmulqdq zmm14, zmm8, zmm16, 0x00
        vpclmulqdq zmm15, zmm8, zmm16, 0x11
        vpxorq  zmm8, zmm14, zmm15
        vpxorq  zmm8, zmm8, zmm17

        sub     arg3, 256
        jge     _fold_256_B_loop

        ;; Fold 256 into 128
        add     arg2, 256
        vpclmulqdq zmm1, zmm0, zmm10, 0x00
        vpclmulqdq zmm2, zmm0, zmm10, 0x11
        vpternlogq zmm7, zmm1, zmm2, 0x96       ; xor ABC

        vpclmulqdq zmm5, zmm4, zmm10, 0x00
        vpclmulqdq zmm6, zmm4, zmm10, 0x11
        vpternlogq zmm8, zmm5, zmm6, 0x96       ; xor ABC

        vmovdqa32 zmm0, zmm7
        vmovdqa32 zmm4, zmm8

        add     arg3, 128
        jmp     _fold_128_B_register

        ; fold 128B at a time. This section of the code folds 2 zmm registers in parallel
_fold_128_B_loop:
        add     arg2, 128               ; update the buffer pointer
        vmovdqu8 zmm8, [arg2+16*0]
        vpshufb zmm8, zmm8, zmm18
        vpclmulqdq zmm1, zmm0, zmm10, 0x00
        vpclmulqdq zmm2, zmm0, zmm10, 0x11
        vpxorq  zmm0, zmm1, zmm2
        vpxorq  zmm0, zmm0, zmm8

        vmovdqu8 zmm9, [arg2+16*4]
        vpshufb zmm9, zmm9, zmm18
        vpclmulqdq zmm5, zmm4, zmm10, 0x00
        vpclmulqdq zmm6, zmm4, zmm10, 0x11
        vpxorq  zmm4, zmm5, zmm6
        vpxorq  zmm4, zmm4, zmm9
        sub     arg3, 128
        jge     _fold_128_B_loop
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        add     arg2, 128
        ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
        ; the 128B of folded data is in 2 zmm registers: zmm0, zmm4

_fold_128_B_register:
        ; fold the 8 128b parts into 1 xmm register with different constants
        vmovdqu8 zmm16, [rk9]           ; multiply by rk9-rk16
        vmovdqu8 zmm11, [rk17]          ; multiply by rk17-rk20, rk1,rk2, 0,0
        vpclmulqdq zmm1, zmm0, zmm16, 0x00
        vpclmulqdq zmm2, zmm0, zmm16, 0x11
        vextracti64x2 xmm7, zmm4, 3     ; save last that has no multiplicand

        vpclmulqdq zmm5, zmm4, zmm11, 0x00
        vpclmulqdq zmm6, zmm4, zmm11, 0x11
        vmovdqa xmm10, [rk1]            ; Needed later in reduction loop
        vpternlogq zmm1, zmm2, zmm5, 0x96       ; xor ABC
        vpternlogq zmm1, zmm6, zmm7, 0x96       ; xor ABC

        vshufi64x2 zmm8, zmm1, zmm1, 0x4e       ; Swap 1,0,3,2 - 01 00 11 10
        vpxorq  ymm8, ymm8, ymm1
        vextracti64x2 xmm5, ymm8, 1
        vpxorq  xmm7, xmm5, xmm8
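        ; xmm7 now holds the single 128-bit folded remainder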

        ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop
        ; instead of a cmp instruction, we use the negative flag with the jl instruction
        add     arg3, 128-16
        jl      _final_reduction_for_128

        ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
        ; we can fold 16 bytes at a time if y>=16
        ; continue folding 16B at a time

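        ; each iteration folds xmm7 forward by 16 bytes: the high qword is
        ; multiplied by rk2 (x^192 mod P) and the low qword by rk1
        ; (x^128 mod P), then the next 16 byte-reflected input bytes are xor-ed in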
_16B_reduction_loop:
        vmovdqa xmm8, xmm7
        vpclmulqdq xmm7, xmm10, 0x11
        vpclmulqdq xmm8, xmm10, 0x00
        vpxor   xmm7, xmm8
        vmovdqu xmm0, [arg2]
        vpshufb xmm0, xmm0, xmm18
        vpxor   xmm7, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the jge instruction
        ; equivalent of: cmp arg3, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge     _16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm7 register


_final_reduction_for_128:
        add     arg3, 16
        je      _128_done
        ; here we are getting data that is less than 16 bytes.
        ; since we know that there was data before the pointer, we can offset
        ; the input pointer before the actual point, to receive exactly 16 bytes.
        ; after that the registers need to be adjusted.
_get_last_two_xmms:

        vmovdqa xmm2, xmm7
        vmovdqu xmm1, [arg2 - 16 + arg3]
        vpshufb xmm1, xmm18

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, arg3
        vmovdqu xmm0, [rax]

        ; shift xmm2 to the left by arg3 bytes
        vpshufb xmm2, xmm0

        ; shift xmm7 to the right by 16-arg3 bytes
        vpxor   xmm0, [mask1]
        vpshufb xmm7, xmm0
        vpblendvb xmm1, xmm1, xmm2, xmm0

        ; fold 16 Bytes
        vmovdqa xmm2, xmm1
        vmovdqa xmm8, xmm7
        vpclmulqdq xmm7, xmm10, 0x11
        vpclmulqdq xmm8, xmm10, 0x0
        vpxor   xmm7, xmm8
        vpxor   xmm7, xmm2

_128_done:
        ; compute crc of a 128-bit value
        vmovdqa xmm10, [rk5]
        vmovdqa xmm0, xmm7

        ;64b fold
        vpclmulqdq xmm7, xmm10, 0x01    ; H*L
        vpslldq xmm0, 8
        vpxor   xmm7, xmm0

        ;barrett reduction
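        ; rk7 = floor(x^128 / P) (the Barrett constant) and rk8 = P; for the
        ; ISO polynomial both happen to have low 64 bits 0x1B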
_barrett:
        vmovdqa xmm10, [rk7]            ; rk7 and rk8 in xmm10
        vmovdqa xmm0, xmm7

        vmovdqa xmm1, xmm7
        vpand   xmm1, [mask3]
        vpclmulqdq xmm7, xmm10, 0x01
        vpxor   xmm7, xmm1

        vpclmulqdq xmm7, xmm10, 0x11
        vpxor   xmm7, xmm0
        vpextrq rax, xmm7, 0

_cleanup:
        not     rax


%ifidn __OUTPUT_FORMAT__, win64
        vmovdqa xmm6, [rsp + XMM_SAVE + 16*0]
        vmovdqa xmm7, [rsp + XMM_SAVE + 16*1]
        vmovdqa xmm8, [rsp + XMM_SAVE + 16*2]
        vmovdqa xmm9, [rsp + XMM_SAVE + 16*3]
        vmovdqa xmm10, [rsp + XMM_SAVE + 16*4]
        vmovdqa xmm11, [rsp + XMM_SAVE + 16*5]
        vmovdqa xmm12, [rsp + XMM_SAVE + 16*6]
        vmovdqa xmm13, [rsp + XMM_SAVE + 16*7]
        vmovdqa xmm14, [rsp + XMM_SAVE + 16*8]
        vmovdqa xmm15, [rsp + XMM_SAVE + 16*9]
%endif
        add     rsp, VARIABLE_OFFSET
        ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_256:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      _less_than_32

        ; if there is, load the constants
        vmovdqa xmm10, [rk1]            ; rk1 and rk2 in xmm10

        vmovq   xmm0, arg1              ; get the initial crc value
        vpslldq xmm0, 8                 ; align it to its correct place
        vmovdqu xmm7, [arg2]            ; load the plaintext
        vpshufb xmm7, xmm18             ; byte-reflect the plaintext
        vpxor   xmm7, xmm0

        ; update the buffer pointer
        add     arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        sub     arg3, 32

        jmp     _16B_reduction_loop

align 16
_less_than_32:
        ; mov initial crc to the return value. this is necessary for zero-length buffers.
        mov     rax, arg1
        test    arg3, arg3
        je      _cleanup

        vmovq   xmm0, arg1              ; get the initial crc value
        vpslldq xmm0, 8                 ; align it to its correct place

        cmp     arg3, 16
        je      _exact_16_left
        jl      _less_than_16_left

        vmovdqu xmm7, [arg2]            ; load the plaintext
        vpshufb xmm7, xmm18             ; byte-reflect the plaintext
        vpxor   xmm7, xmm0              ; xor the initial crc value
        add     arg2, 16
        sub     arg3, 16
        vmovdqa xmm10, [rk1]            ; rk1 and rk2 in xmm10
        jmp     _get_last_two_xmms


align 16
_less_than_16_left:
        ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.

        vpxor   xmm1, xmm1
        mov     r11, rsp
        vmovdqa [r11], xmm1

        ; backup the counter value
        mov     r9, arg3
        cmp     arg3, 8
        jl      _less_than_8_left

        ; load 8 Bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
_less_than_8_left:

        cmp     arg3, 4
        jl      _less_than_4_left

        ; load 4 Bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
_less_than_4_left:

        cmp     arg3, 2
        jl      _less_than_2_left

        ; load 2 Bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
_less_than_2_left:
        cmp     arg3, 1
        jl      _zero_left

        ; load 1 Byte
        mov     al, [arg2]
        mov     [r11], al

_zero_left:
        vmovdqa xmm7, [rsp]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0              ; xor the initial crc value

        lea     rax, [pshufb_shf_table + 16]
        sub     rax, r9

        cmp     r9, 8
        jl      _end_1to7

_end_8to15:
        vmovdqu xmm0, [rax]
        vpxor   xmm0, [mask1]

        vpshufb xmm7, xmm0
        jmp     _128_done

_end_1to7:
        ; Right shift (8-length) bytes in XMM
        add     rax, 8
        vmovdqu xmm0, [rax]
        vpshufb xmm7, xmm0

        jmp     _barrett

align 16
_exact_16_left:
        vmovdqu xmm7, [arg2]
        vpshufb xmm7, xmm18
        vpxor   xmm7, xmm0              ; xor the initial crc value

        jmp     _128_done

section .data
align 32

%ifndef USE_CONSTS
; precomputed constants
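; rk1 = x^128 mod P and rk2 = x^192 mod P are the 16-byte fold constants;
; rk_1/rk_2 and rk3/rk4 are the corresponding constants for the 256-byte
; and 128-byte fold distances; rk9..rk20 collapse the eight folded 128-bit
; lanes; rk5 folds 128 bits down to 64; rk7 = floor(x^128 / P) and rk8 = P
; are the Barrett-reduction constants, with P = x^64 + x^4 + x^3 + x + 1
; (CRC64-ISO, normal form).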
rk_1:  dq 0x0000001a00000144
rk_2:  dq 0x0000015e00001dac
rk1:   dq 0x0000000000000145
rk2:   dq 0x0000000000001db7
rk3:   dq 0x000100000001001a
rk4:   dq 0x001b0000001b015e
rk5:   dq 0x0000000000000145
rk6:   dq 0x0000000000000000
rk7:   dq 0x000000000000001b
rk8:   dq 0x000000000000001b
rk9:   dq 0x0150145145145015
rk10:  dq 0x1c71db6db6db71c7
rk11:  dq 0x0001110110110111
rk12:  dq 0x001aab1ab1ab1aab
rk13:  dq 0x0000014445014445
rk14:  dq 0x00001daab71daab7
rk15:  dq 0x0000000101000101
rk16:  dq 0x0000001b1b001b1b
rk17:  dq 0x0000000001514515
rk18:  dq 0x000000001c6db6c7
rk19:  dq 0x0000000000011011
rk20:  dq 0x00000000001ab1ab

rk_1b: dq 0x0000000000000145
rk_2b: dq 0x0000000000001db7
       dq 0x0000000000000000
       dq 0x0000000000000000
%else
INCLUDE_CONSTS
%endif

mask1: dq 0x8080808080808080, 0x8080808080808080
mask2: dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
mask3: dq 0x0000000000000000, 0xFFFFFFFFFFFFFFFF
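; mask1 is xor-ed into a pshufb_shf_table entry to turn the left-shift
; shuffle mask into the complementary right-shift mask (pshufb zeroes any
; byte whose mask index has the top bit set); mask3 keeps only the upper
; 64 bits and is used in the Barrett reduction step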

SHUF_MASK: dq 0x08090A0B0C0D0E0F, 0x0001020304050607

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
        dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
        dq 0x0706050403020100, 0x0f0e0d0c0b0a0908
        dq 0x8080808080808080, 0x0f0e0d0c0b0a0908
        dq 0x8080808080808080, 0x8080808080808080


%else  ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
%ifidn __OUTPUT_FORMAT__, win64
global no_ %+ FUNCTION_NAME
no_ %+ FUNCTION_NAME %+ :
%endif
%endif ; (AS_FEATURE_LEVEL) >= 10