;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
;       Function API:
;       UINT16 crc16_t10dif_by4(
;               UINT16 init_crc,          // initial CRC value, 16 bits
;               const unsigned char *buf, // buffer pointer to calculate CRC on
;               UINT64 len                // buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://download.intel.com/design/intarch/papers/323102.pdf
;
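; Illustrative C call (a sketch only; the real prototype is declared in
; the library's public headers, and "data"/"data_len" are placeholder
; names):
;
;       #include <stdint.h>
;
;       uint16_t crc16_t10dif_by4(uint16_t init_crc,
;                                 const unsigned char *buf, uint64_t len);
;
;       uint16_t crc = crc16_t10dif_by4(0x0000, data, data_len);
;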

%include "reg_sizes.asm"

%define fetch_dist 1024

[bits 64]
default rel

section .text
%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8

        %xdefine        arg1_low32 ecx
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx

        %xdefine        arg1_low32 edi
%endif

align 16
mk_global crc16_t10dif_by4, function
crc16_t10dif_by4:
        endbranch

        ; adjust the 16-bit initial_crc value, scale it to 32 bits
        shl     arg1_low32, 16

        ; After this point, the code flow is exactly the same as for a
        ; 32-bit CRC. The only difference is that before returning eax,
        ; we shift it right by 16 bits to scale the result back to 16 bits.
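        ; (e.g. an init_crc of 0x1234 enters the folding code as
        ; 0x12340000; the final "shr eax, 16" in _cleanup undoes this
        ; scaling)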

        sub     rsp, 16*4+8

        ; push the xmm registers onto the stack to preserve them
        movdqa  [rsp+16*2], xmm6
        movdqa  [rsp+16*3], xmm7

        ; check if the buffer is smaller than 128B
        cmp     arg3, 128

        ; for sizes less than 128, we can't fold 64B at a time...
        jl      _less_than_128


        ; load the initial crc value
        movd    xmm6, arg1_low32        ; initial crc

        ; the crc value does not need to be byte-reflected, but it does
        ; need to be moved to the high part of the register, because the
        ; data will be byte-reflected and will then align with the initial
        ; crc in the correct place
        pslldq  xmm6, 12
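        ; (e.g. a scaled crc of 0x12340000 now sits in bytes 12-15 of
        ; xmm6, so after the pshufb byte-reflection below it lines up with
        ; the first, highest-order bytes of the message)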

        movdqa  xmm7, [SHUF_MASK]
        ; receive the initial 64B of data, xor the initial crc value
        movdqu  xmm0, [arg2]
        movdqu  xmm1, [arg2+16]
        movdqu  xmm2, [arg2+32]
        movdqu  xmm3, [arg2+48]

        pshufb  xmm0, xmm7
        ; XOR the initial_crc value
        pxor    xmm0, xmm6
        pshufb  xmm1, xmm7
        pshufb  xmm2, xmm7
        pshufb  xmm3, xmm7

        movdqa  xmm6, [rk3]     ; xmm6 has rk3 and rk4
                                ; the imm value of the pclmulqdq instruction
                                ; will determine which constant to use
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; we subtract 128 instead of 64 to save one instruction from the loop
        sub     arg3, 128

        ; at this point there are 64*x + y (0 <= y < 64) bytes of buffer
        ; left; the _fold_64_B_loop below will fold 64B at a time until
        ; only 64 + y bytes of buffer remain


        ; fold 64B at a time. this section of the code folds 4 xmm
        ; registers in parallel
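        ; folding identity used below (a sketch, in the x^32-scaled form
        ; this file works in): with xmm6 = (rk3 : rk4), each step replaces
        ; a 128-bit accumulator A = A_hi*x^64 + A_lo by
        ;       A' = A_hi*rk4  xor  A_lo*rk3  xor  new_data
        ; which is congruent to A*x^512 xor new_data modulo Q, so all
        ; reduction work can be deferred until after the loop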
_fold_64_B_loop:

        ; update the buffer pointer
        add     arg2, 64                ; buf += 64;

        prefetchnta [arg2+fetch_dist+0]
        movdqu  xmm4, xmm0
        movdqu  xmm5, xmm1

        pclmulqdq       xmm0, xmm6, 0x11
        pclmulqdq       xmm1, xmm6, 0x11

        pclmulqdq       xmm4, xmm6, 0x0
        pclmulqdq       xmm5, xmm6, 0x0

        pxor    xmm0, xmm4
        pxor    xmm1, xmm5

        prefetchnta [arg2+fetch_dist+32]
        movdqu  xmm4, xmm2
        movdqu  xmm5, xmm3

        pclmulqdq       xmm2, xmm6, 0x11
        pclmulqdq       xmm3, xmm6, 0x11

        pclmulqdq       xmm4, xmm6, 0x0
        pclmulqdq       xmm5, xmm6, 0x0

        pxor    xmm2, xmm4
        pxor    xmm3, xmm5

        movdqu  xmm4, [arg2]
        movdqu  xmm5, [arg2+16]
        pshufb  xmm4, xmm7
        pshufb  xmm5, xmm7
        pxor    xmm0, xmm4
        pxor    xmm1, xmm5

        movdqu  xmm4, [arg2+32]
        movdqu  xmm5, [arg2+48]
        pshufb  xmm4, xmm7
        pshufb  xmm5, xmm7

        pxor    xmm2, xmm4
        pxor    xmm3, xmm5

        sub     arg3, 64

        ; check if there is another 64B in the buffer to be able to fold
        jge     _fold_64_B_loop
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


        add     arg2, 64
        ; at this point, the buffer pointer is pointing at the last y bytes
        ; of the buffer and the 64B of folded data is in 4 of the xmm
        ; registers: xmm0, xmm1, xmm2, xmm3


        ; fold the 4 xmm registers to 1 xmm register with different constants

        movdqa  xmm6, [rk1]     ; xmm6 has rk1 and rk2
                                ; the imm value of the pclmulqdq instruction
                                ; will determine which constant to use
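        ; (this uses the same folding identity as the main loop, but with
        ; rk1/rk2 the fold distance is 128 bits, which is what lets the
        ; four registers collapse into one)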

        movdqa  xmm4, xmm0
        pclmulqdq       xmm0, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm1, xmm4
        pxor    xmm1, xmm0

        movdqa  xmm4, xmm1
        pclmulqdq       xmm1, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm2, xmm4
        pxor    xmm2, xmm1

        movdqa  xmm4, xmm2
        pclmulqdq       xmm2, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2


        ; instead of 64, we add 48 to the loop counter to save 1 instruction
        ; from the loop; instead of a cmp instruction, we use the negative
        ; flag with the jl instruction
        add     arg3, 64-16
        jl      _final_reduction_for_128

        ; now we have 16+y bytes left to reduce: 16 bytes are in register
        ; xmm3 and the rest is in memory; we can fold 16 bytes at a time
        ; if y >= 16, so continue folding 16B at a time

_16B_reduction_loop:
        movdqa  xmm4, xmm3
        pclmulqdq       xmm3, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        movdqu  xmm0, [arg2]
        pshufb  xmm0, xmm7
        pxor    xmm3, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the jge
        ; instruction (equivalent of: cmp arg3, 16-16)
        ; check if there is any more 16B in the buffer to be able to fold
        jge     _16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm3 register


_final_reduction_for_128:
        ; check if any more data to fold. if not, compute the CRC of the
        ; final 128 bits
        add     arg3, 16
        je      _128_done

        ; here we are dealing with fewer than 16 bytes of tail data. since
        ; we know there was data before the current pointer, we can back
        ; the pointer up to load exactly 16 bytes; afterwards the
        ; registers need to be adjusted.
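        ; (sketch: for y remaining bytes, xmm1 reloads the final 16 bytes
        ; of the buffer, the pshufb_shf_table constant shifts the old
        ; remainder so its bytes line up with that window, pblendvb
        ; splices the remainder over the stale bytes of xmm1, and one
        ; more 16-byte fold finishes the job)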
_get_last_two_xmms:
        movdqa  xmm2, xmm3

        movdqu  xmm1, [arg2 - 16 + arg3]
        pshufb  xmm1, xmm7

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, arg3
        movdqu  xmm0, [rax]

        ; shift xmm2 to the left by arg3 bytes
        pshufb  xmm2, xmm0

        ; shift xmm3 to the right by 16-arg3 bytes
        pxor    xmm0, [mask1]
        pshufb  xmm3, xmm0
        pblendvb        xmm1, xmm2      ; xmm0 is implicit

        ; fold 16 bytes
        movdqa  xmm2, xmm1
        movdqa  xmm4, xmm3
        pclmulqdq       xmm3, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2

_128_done:
        ; compute crc of a 128-bit value
        movdqa  xmm6, [rk5]     ; rk5 and rk6 in xmm6
        movdqa  xmm0, xmm3

        ; 64b fold
        pclmulqdq       xmm3, xmm6, 0x1
        pslldq  xmm0, 8
        pxor    xmm3, xmm0

        ; 32b fold
        movdqa  xmm0, xmm3

        pand    xmm0, [mask2]

        psrldq  xmm3, 12
        pclmulqdq       xmm3, xmm6, 0x10
        pxor    xmm3, xmm0

        ; barrett reduction
_barrett:
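        ; a sketch of the math, with R = the 64-bit value left in xmm3
        ; (the pslldq shifts below only re-position lanes for pclmulqdq):
        ;       T1  = floor(R / x^32) * rk7     ; rk7 = floor(2^64/Q)
        ;       T2  = floor(T1 / x^32) * rk8    ; rk8 = Q
        ;       crc = (R xor T2) mod x^32
        ; i.e. Barrett reduction trades the division by Q for two
        ; carry-less multiplies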
        movdqa  xmm6, [rk7]     ; rk7 and rk8 in xmm6
        movdqa  xmm0, xmm3
        pclmulqdq       xmm3, xmm6, 0x01
        pslldq  xmm3, 4
        pclmulqdq       xmm3, xmm6, 0x11

        pslldq  xmm3, 4
        pxor    xmm3, xmm0
        pextrd  eax, xmm3, 1

_cleanup:
        ; scale the result back to 16 bits
        shr     eax, 16
        movdqa  xmm6, [rsp+16*2]
        movdqa  xmm7, [rsp+16*3]
        add     rsp, 16*4+8
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_128:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      _less_than_32
        movdqa  xmm7, [SHUF_MASK]

        ; if there is, load the constants
        movdqa  xmm6, [rk1]     ; rk1 and rk2 in xmm6

        movd    xmm0, arg1_low32        ; get the initial crc value
        pslldq  xmm0, 12                ; align it to its correct place
        movdqu  xmm3, [arg2]            ; load the plaintext
        pshufb  xmm3, xmm7              ; byte-reflect the plaintext
        pxor    xmm3, xmm0


        ; update the buffer pointer
        add     arg2, 16

        ; update the counter; subtract 32 instead of 16 to save one
        ; instruction from the loop
        sub     arg3, 32

        jmp     _16B_reduction_loop


align 16
_less_than_32:
        ; mov the initial crc to the return value; this is necessary for
        ; zero-length buffers
        mov     eax, arg1_low32
        test    arg3, arg3
        je      _cleanup

        movdqa  xmm7, [SHUF_MASK]

        movd    xmm0, arg1_low32        ; get the initial crc value
        pslldq  xmm0, 12                ; align it to its correct place

        cmp     arg3, 16
        je      _exact_16_left
        jl      _less_than_16_left

        movdqu  xmm3, [arg2]            ; load the plaintext
        pshufb  xmm3, xmm7              ; byte-reflect the plaintext
        pxor    xmm3, xmm0              ; xor the initial crc value
        add     arg2, 16
        sub     arg3, 16
        movdqa  xmm6, [rk1]             ; rk1 and rk2 in xmm6
        jmp     _get_last_two_xmms


align 16
_less_than_16_left:
        ; use stack space to stage data shorter than 16 bytes; zero out
        ; the 16B of scratch memory first
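        ; (e.g. a 7-byte tail is staged as one 4-byte, one 2-byte and one
        ; 1-byte copy into the zeroed scratch slot, then handled below as
        ; a full 16B block)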

        pxor    xmm1, xmm1
        mov     r11, rsp
        movdqa  [r11], xmm1

        cmp     arg3, 4
        jl      _only_less_than_4

        ; backup the counter value
        mov     r9, arg3
        cmp     arg3, 8
        jl      _less_than_8_left

        ; load 8 bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
_less_than_8_left:

        cmp     arg3, 4
        jl      _less_than_4_left

        ; load 4 bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
_less_than_4_left:

        cmp     arg3, 2
        jl      _less_than_2_left

        ; load 2 bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
_less_than_2_left:
        cmp     arg3, 1
        jl      _zero_left

        ; load 1 byte
        mov     al, [arg2]
        mov     [r11], al
_zero_left:
        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

        ; shl r9, 4
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, r9
        movdqu  xmm0, [rax]
        pxor    xmm0, [mask1]

        pshufb  xmm3, xmm0
        jmp     _128_done

align 16
_exact_16_left:
        movdqu  xmm3, [arg2]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

        jmp     _128_done

_only_less_than_4:
        cmp     arg3, 3
        jl      _only_less_than_3

        ; load 3 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        mov     al, [arg2+2]
        mov     [r11+2], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

        psrldq  xmm3, 5

        jmp     _barrett
_only_less_than_3:
        cmp     arg3, 2
        jl      _only_less_than_2

        ; load 2 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

        psrldq  xmm3, 6

        jmp     _barrett
_only_less_than_2:

        ; load 1 byte
        mov     al, [arg2]
        mov     [r11], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

        psrldq  xmm3, 7

        jmp     _barrett

section .data

; precomputed constants
; these constants are precomputed from the poly 0x8bb70000 (0x8bb7 scaled
; to 32 bits)
align 16
; Q = 0x18BB70000
; rk1 = 2^(32*3) mod Q << 32
; rk2 = 2^(32*5) mod Q << 32
; rk3 = 2^(32*15) mod Q << 32
; rk4 = 2^(32*17) mod Q << 32
; rk5 = 2^(32*3) mod Q << 32
; rk6 = 2^(32*2) mod Q << 32
; rk7 = floor(2^64/Q)
; rk8 = Q
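;
; A hypothetical C sketch (not part of this file) of how constants of the
; form "2^n mod Q << 32" can be re-derived over GF(2):
;
;       #include <stdint.h>
;
;       /* Q = 0x18BB70000 has degree 32 (bit 32 is the leading term). */
;       static uint64_t xn_mod_q(unsigned n, uint64_t q)
;       {
;               uint64_t r = 1;                 /* start from x^0 */
;               while (n--) {
;                       r <<= 1;                /* multiply by x */
;                       if (r & (1ULL << 32))   /* degree reached 32 */
;                               r ^= q;         /* subtract (xor) Q */
;               }
;               return r;                       /* remainder, degree < 32 */
;       }
;
;       /* e.g. rk3 should equal xn_mod_q(32*15, 0x18BB70000ULL) << 32 */
;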
rk1:
        DQ 0x2d56000000000000
rk2:
        DQ 0x06df000000000000
rk3:
        DQ 0x044c000000000000
rk4:
        DQ 0xe658000000000000
rk5:
        DQ 0x2d56000000000000
rk6:
        DQ 0x1368000000000000
rk7:
        DQ 0x00000001f65a57f8
rk8:
        DQ 0x000000018bb70000
mask1:
        dq 0x8080808080808080, 0x8080808080808080
mask2:
        dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

SHUF_MASK:
        dq 0x08090A0B0C0D0E0F, 0x0001020304050607

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
        dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
        dq 0x0706050403020100, 0x000e0d0c0b0a0908

;;;       func            core, ver, snum
slversion crc16_t10dif_by4, 05, 02, 0016