;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
;       Function API:
;       UINT16 crc16_t10dif_copy_by4(
;               UINT16 init_crc,          // initial CRC value, 16 bits
;               unsigned char *dst,       // buffer pointer destination for copy
;               const unsigned char *src, // buffer pointer to calculate CRC on
;               UINT64 len                // buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://download.intel.com/design/intarch/papers/323102.pdf
;
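;       As a usage illustration only (not part of this file's build): a minimal C
;       caller sketch based on the prototype above. The extern declaration and the
;       zero seed are assumptions typical for T10 DIF, not taken from a real header.
;
;           #include <stdint.h>
;
;           extern uint16_t crc16_t10dif_copy_by4(uint16_t init_crc, unsigned char *dst,
;                                                 const unsigned char *src, uint64_t len);
;
;           /* copy src into dst and return the T10 DIF CRC of the copied bytes */
;           uint16_t copy_with_crc(unsigned char *dst, const unsigned char *src, uint64_t len)
;           {
;                   return crc16_t10dif_copy_by4(0, dst, src, len);
;           }
;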

%include "reg_sizes.asm"

%define fetch_dist      1024

[bits 64]
default rel

section .text
%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8
        %xdefine        arg4 r9
        %xdefine        tmp1 r10
        %xdefine        arg1_low32 ecx
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx
        %xdefine        arg4 rcx
        %xdefine        tmp1 r10
        %xdefine        arg1_low32 edi
%endif

align 16
global crc16_t10dif_copy_by4:ISAL_SYM_TYPE_FUNCTION
crc16_t10dif_copy_by4:

        ; adjust the 16-bit initial_crc value, scale it to 32 bits
        shl     arg1_low32, 16

        ; After this point, the code flow is exactly the same as for a 32-bit CRC.
        ; The only difference is that before returning, eax is shifted right by
        ; 16 bits to scale the result back to 16 bits.
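        ; (The 16-bit T10 DIF polynomial 0x8bb7 is used throughout in its 32-bit
        ; scaled form 0x8bb70000, so 32-bit folding constants and a 32-bit Barrett
        ; reduction can be reused unchanged; the low 16 bits of the scaled result
        ; are always zero, which is why "shr eax, 16" at _cleanup recovers the
        ; 16-bit CRC.)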

        sub     rsp, 16*4+8

        ; push the xmm registers onto the stack so they can be restored on return
        movdqa  [rsp+16*2], xmm6
        movdqa  [rsp+16*3], xmm7

        ; check if smaller than 128B
        cmp     arg4, 128

        ; for sizes less than 128, we can't fold 64B at a time...
        jl      _less_than_128


        ; load the initial crc value
        movd    xmm6, arg1_low32        ; initial crc

        ; the crc value does not need to be byte-reflected, but it does need to
        ; be moved to the high part of the register,
        ; because the data will be byte-reflected and will then align
        ; with the initial crc in the correct place.
        pslldq  xmm6, 12

        movdqa  xmm7, [SHUF_MASK]
        ; receive the initial 64B data, xor the initial crc value
        movdqu  xmm0, [arg3]
        movdqu  xmm1, [arg3+16]
        movdqu  xmm2, [arg3+32]
        movdqu  xmm3, [arg3+48]

        ; copy initial data
        movdqu  [arg2], xmm0
        movdqu  [arg2+16], xmm1
        movdqu  [arg2+32], xmm2
        movdqu  [arg2+48], xmm3

        pshufb  xmm0, xmm7
        ; XOR the initial_crc value
        pxor    xmm0, xmm6
        pshufb  xmm1, xmm7
        pshufb  xmm2, xmm7
        pshufb  xmm3, xmm7

        movdqa  xmm6, [rk3]     ; xmm6 has rk3 and rk4
                                ; the imm value of the pclmulqdq instruction
                                ; will determine which constant to use
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; we subtract 128 instead of 64 to save one instruction from the loop
        sub     arg4, 128

        ; at this point there are 64*x+y (0 <= y < 64) bytes in the buffer.
        ; the _fold_64_B_loop below
        ; folds 64B at a time until only 64+y bytes of buffer remain


        ; fold 64B at a time. This section of the code folds 4 xmm
        ; registers in parallel
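        ; Each iteration below multiplies the four 128-bit accumulators by the
        ; rk3/rk4 folding constants (high and low 64-bit halves with separate
        ; carry-less multiplies), XORs the two products together, and then XORs
        ; in the next 64 bytes of input, so the accumulators stay congruent
        ; (modulo the polynomial) to the CRC state of all data consumed so far.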
_fold_64_B_loop:

        ; update the buffer pointer
        add     arg3, 64                ; buf += 64;
        add     arg2, 64

        prefetchnta [arg3+fetch_dist+0]
        movdqu  xmm4, xmm0
        movdqu  xmm5, xmm1

        pclmulqdq xmm0, xmm6, 0x11
        pclmulqdq xmm1, xmm6, 0x11

        pclmulqdq xmm4, xmm6, 0x0
        pclmulqdq xmm5, xmm6, 0x0

        pxor    xmm0, xmm4
        pxor    xmm1, xmm5

        prefetchnta [arg3+fetch_dist+32]
        movdqu  xmm4, xmm2
        movdqu  xmm5, xmm3

        pclmulqdq xmm2, xmm6, 0x11
        pclmulqdq xmm3, xmm6, 0x11

        pclmulqdq xmm4, xmm6, 0x0
        pclmulqdq xmm5, xmm6, 0x0

        pxor    xmm2, xmm4
        pxor    xmm3, xmm5

        movdqu  xmm4, [arg3]
        movdqu  xmm5, [arg3+16]
        movdqu  [arg2], xmm4
        movdqu  [arg2+16], xmm5
        pshufb  xmm4, xmm7
        pshufb  xmm5, xmm7
        pxor    xmm0, xmm4
        pxor    xmm1, xmm5

        movdqu  xmm4, [arg3+32]
        movdqu  xmm5, [arg3+48]
        movdqu  [arg2+32], xmm4
        movdqu  [arg2+48], xmm5
        pshufb  xmm4, xmm7
        pshufb  xmm5, xmm7

        pxor    xmm2, xmm4
        pxor    xmm3, xmm5

        sub     arg4, 64

        ; check if there is another 64B in the buffer to be able to fold
        jge     _fold_64_B_loop
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


        add     arg3, 64
        add     arg2, 64
        ; at this point, the buffer pointer is pointing at the last y bytes of the buffer
        ; the 64B of folded data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3


        ; fold the 4 xmm registers to 1 xmm register with different constants

        movdqa  xmm6, [rk1]     ; xmm6 has rk1 and rk2
                                ; the imm value of the pclmulqdq instruction will
                                ; determine which constant to use

        movdqa  xmm4, xmm0
        pclmulqdq xmm0, xmm6, 0x11
        pclmulqdq xmm4, xmm6, 0x0
        pxor    xmm1, xmm4
        pxor    xmm1, xmm0

        movdqa  xmm4, xmm1
        pclmulqdq xmm1, xmm6, 0x11
        pclmulqdq xmm4, xmm6, 0x0
        pxor    xmm2, xmm4
        pxor    xmm2, xmm1

        movdqa  xmm4, xmm2
        pclmulqdq xmm2, xmm6, 0x11
        pclmulqdq xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2


        ; instead of adding back 64, we add 48 (64-16) to the loop counter to save 1 instruction from the loop
        ; instead of a cmp instruction, we use the sign flag with the jl instruction
        add     arg4, 64-16
        jl      _final_reduction_for_128

        ; now we have 16+y bytes left to reduce: 16 bytes
        ; are in register xmm3 and the rest are in memory.
        ; we can fold 16 bytes at a time if y >= 16, so
        ; continue folding 16B at a time

_16B_reduction_loop:
        movdqa  xmm4, xmm3
        pclmulqdq xmm3, xmm6, 0x11
        pclmulqdq xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        movdqu  xmm0, [arg3]
        movdqu  [arg2], xmm0
        pshufb  xmm0, xmm7
        pxor    xmm3, xmm0
        add     arg3, 16
        add     arg2, 16
        sub     arg4, 16
        ; instead of a cmp instruction, we utilize the flags with the jge instruction
        ; equivalent of: cmp arg4, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge     _16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm3 register


_final_reduction_for_128:
        ; check if any more data to fold. If not, compute the CRC of the final 128 bits
        add     arg4, 16
        je      _128_done

        ; here we have less than 16 bytes of data left.
        ; since we know there was data before the pointer, we can offset
        ; the input pointer back before the actual point, so that exactly
        ; 16 bytes are loaded.
        ; after that the registers need to be adjusted.
_get_last_two_xmms:
        movdqa  xmm2, xmm3

        movdqu  xmm1, [arg3 - 16 + arg4]
        movdqu  [arg2 - 16 + arg4], xmm1
        pshufb  xmm1, xmm7

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, arg4
        movdqu  xmm0, [rax]

        ; shift xmm2 to the left by arg4 bytes
        pshufb  xmm2, xmm0

        ; shift xmm3 to the right by 16-arg4 bytes
        pxor    xmm0, [mask1]
        pshufb  xmm3, xmm0
        pblendvb xmm1, xmm2             ; xmm0 is implicit
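        ; xmm1 now holds a single 16-byte block that combines the high part of
        ; the folded value with the last arg4 input bytes (xmm0's sign bits
        ; select between the two shifted copies in the pblendvb above); one more
        ; 16-byte fold below absorbs it into xmm3.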

        ; fold 16 Bytes
        movdqa  xmm2, xmm1
        movdqa  xmm4, xmm3
        pclmulqdq xmm3, xmm6, 0x11
        pclmulqdq xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2

_128_done:
        ; compute crc of a 128-bit value
        movdqa  xmm6, [rk5]             ; rk5 and rk6 in xmm6
        movdqa  xmm0, xmm3

        ; 64b fold
        pclmulqdq xmm3, xmm6, 0x1
        pslldq  xmm0, 8
        pxor    xmm3, xmm0

        ; 32b fold
        movdqa  xmm0, xmm3

        pand    xmm0, [mask2]

        psrldq  xmm3, 12
        pclmulqdq xmm3, xmm6, 0x10
        pxor    xmm3, xmm0

        ; Barrett reduction
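        ; rk7 = floor(2^64/Q) is the precomputed Barrett reciprocal and rk8 = Q is
        ; the polynomial itself: the two carry-less multiplies below estimate the
        ; quotient of the remaining value by Q and XOR away quotient*Q, leaving
        ; the 32-bit remainder, which pextrd then extracts.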
_barrett:
        movdqa  xmm6, [rk7]             ; rk7 and rk8 in xmm6
        movdqa  xmm0, xmm3
        pclmulqdq xmm3, xmm6, 0x01
        pslldq  xmm3, 4
        pclmulqdq xmm3, xmm6, 0x11

        pslldq  xmm3, 4
        pxor    xmm3, xmm0
        pextrd  eax, xmm3, 1

_cleanup:
        ; scale the result back to 16 bits
        shr     eax, 16
        movdqa  xmm6, [rsp+16*2]
        movdqa  xmm7, [rsp+16*3]
        add     rsp, 16*4+8
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_128:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg4, 32
        jl      _less_than_32
        movdqa  xmm7, [SHUF_MASK]

        ; if there is, load the constants
        movdqa  xmm6, [rk1]             ; rk1 and rk2 in xmm6

        movd    xmm0, arg1_low32        ; get the initial crc value
        pslldq  xmm0, 12                ; align it to its correct place
        movdqu  xmm3, [arg3]            ; load the plaintext
        movdqu  [arg2], xmm3            ; store copy
        pshufb  xmm3, xmm7              ; byte-reflect the plaintext
        pxor    xmm3, xmm0


        ; update the buffer pointer
        add     arg3, 16
        add     arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        sub     arg4, 32

        jmp     _16B_reduction_loop


align 16
_less_than_32:
        ; move the initial crc to the return value. this is necessary for zero-length buffers.
        mov     eax, arg1_low32
        test    arg4, arg4
        je      _cleanup

        movdqa  xmm7, [SHUF_MASK]

        movd    xmm0, arg1_low32        ; get the initial crc value
        pslldq  xmm0, 12                ; align it to its correct place

        cmp     arg4, 16
        je      _exact_16_left
        jl      _less_than_16_left

        movdqu  xmm3, [arg3]            ; load the plaintext
        movdqu  [arg2], xmm3            ; store the copy
        pshufb  xmm3, xmm7              ; byte-reflect the plaintext
        pxor    xmm3, xmm0              ; xor the initial crc value
        add     arg3, 16
        add     arg2, 16
        sub     arg4, 16
        movdqa  xmm6, [rk1]             ; rk1 and rk2 in xmm6
        jmp     _get_last_two_xmms


align 16
_less_than_16_left:
        ; use stack space to load data less than 16 bytes; zero out the 16B of stack memory first.

        pxor    xmm1, xmm1
        mov     r11, rsp
        movdqa  [r11], xmm1

        cmp     arg4, 4
        jl      _only_less_than_4

        ; back up the counter value
        mov     tmp1, arg4
        cmp     arg4, 8
        jl      _less_than_8_left

        ; load 8 Bytes
        mov     rax, [arg3]
        mov     [arg2], rax
        mov     [r11], rax
        add     r11, 8
        sub     arg4, 8
        add     arg3, 8
        add     arg2, 8
_less_than_8_left:

        cmp     arg4, 4
        jl      _less_than_4_left

        ; load 4 Bytes
        mov     eax, [arg3]
        mov     [arg2], eax
        mov     [r11], eax
        add     r11, 4
        sub     arg4, 4
        add     arg3, 4
        add     arg2, 4
_less_than_4_left:

        cmp     arg4, 2
        jl      _less_than_2_left

        ; load 2 Bytes
        mov     ax, [arg3]
        mov     [arg2], ax
        mov     [r11], ax
        add     r11, 2
        sub     arg4, 2
        add     arg3, 2
        add     arg2, 2
_less_than_2_left:
        cmp     arg4, 1
        jl      _zero_left

        ; load 1 Byte
        mov     al, [arg3]
        mov     [arg2], al
        mov     [r11], al
_zero_left:
        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0              ; xor the initial crc value

        ; shl tmp1, 4
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, tmp1
        movdqu  xmm0, [rax]
        pxor    xmm0, [mask1]

        pshufb  xmm3, xmm0
        jmp     _128_done

align 16
_exact_16_left:
        movdqu  xmm3, [arg3]
        movdqu  [arg2], xmm3
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0              ; xor the initial crc value

        jmp     _128_done

_only_less_than_4:
        cmp     arg4, 3
        jl      _only_less_than_3

        ; load 3 Bytes
        mov     al, [arg3]
        mov     [arg2], al
        mov     [r11], al

        mov     al, [arg3+1]
        mov     [arg2+1], al
        mov     [r11+1], al

        mov     al, [arg3+2]
        mov     [arg2+2], al
        mov     [r11+2], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0              ; xor the initial crc value

        psrldq  xmm3, 5

        jmp     _barrett
_only_less_than_3:
        cmp     arg4, 2
        jl      _only_less_than_2

        ; load 2 Bytes
        mov     al, [arg3]
        mov     [arg2], al
        mov     [r11], al

        mov     al, [arg3+1]
        mov     [arg2+1], al
        mov     [r11+1], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0              ; xor the initial crc value

        psrldq  xmm3, 6

        jmp     _barrett
_only_less_than_2:

        ; load 1 Byte
        mov     al, [arg3]
        mov     [arg2], al
        mov     [r11], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0              ; xor the initial crc value

        psrldq  xmm3, 7

        jmp     _barrett

section .data

; precomputed constants
; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
align 16
; Q = 0x18BB70000
; rk1 = 2^(32*3) mod Q << 32
; rk2 = 2^(32*5) mod Q << 32
; rk3 = 2^(32*15) mod Q << 32
; rk4 = 2^(32*17) mod Q << 32
; rk5 = 2^(32*3) mod Q << 32
; rk6 = 2^(32*2) mod Q << 32
; rk7 = floor(2^64/Q)
; rk8 = Q
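;
; A hedged sketch (not part of the build) of how the "2^(32*k) mod Q << 32"
; constants above could be regenerated in C; the helper name is illustrative only:
;
;     #include <stdint.h>
;
;     /* x^n mod Q over GF(2), with Q = 0x18BB70000 (degree 32) */
;     static uint32_t xn_mod_q(unsigned n)
;     {
;             uint32_t r = 1;                         /* x^0 */
;             while (n--)                             /* multiply by x, then reduce */
;                     r = (r << 1) ^ ((r & 0x80000000u) ? 0x8BB70000u : 0);
;             return r;
;     }
;
;     /* e.g. rk1 = (uint64_t)xn_mod_q(32*3) << 32, rk3 = (uint64_t)xn_mod_q(32*15) << 32 */
;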
rk1:
DQ 0x2d56000000000000
rk2:
DQ 0x06df000000000000
rk3:
DQ 0x044c000000000000
rk4:
DQ 0xe658000000000000
rk5:
DQ 0x2d56000000000000
rk6:
DQ 0x1368000000000000
rk7:
DQ 0x00000001f65a57f8
rk8:
DQ 0x000000018bb70000
mask1:
dq 0x8080808080808080, 0x8080808080808080
mask2:
dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

SHUF_MASK:
dq 0x08090A0B0C0D0E0F, 0x0001020304050607
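; SHUF_MASK reverses the byte order of a 16-byte lane (pshufb control byte i
; selects source byte 15-i), so the first buffer byte lands in the most
; significant byte of the register, as this MSB-first (non-reflected) CRC requires.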

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908

;;;       func            core, ver, snum
slversion crc16_t10dif_copy_by4, 05, 02, 0000