;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
;       Function API:
;       UINT16 crc16_t10dif_by4(
;               UINT16 init_crc,          //initial CRC value, 16 bits
;               const unsigned char *buf, //buffer pointer to calculate CRC on
;               UINT64 len                //buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://download.intel.com/design/intarch/papers/323102.pdf
;

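;       Algorithm overview:
;       The 16-bit CRC is computed as a 32-bit CRC with the polynomial
;       scaled up by 16 bits (0x8bb70000; see the constants in .data).
;       The bulk of the buffer is folded 64 bytes at a time using
;       PCLMULQDQ, the remainder is folded 16 bytes at a time, and the
;       final 128 bits are reduced to the CRC by two folds followed by
;       a Barrett reduction. Tails shorter than 16 bytes are staged
;       through a zeroed 16-byte stack buffer.
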
%include "reg_sizes.asm"

%define fetch_dist 1024

[bits 64]
default rel

section .text
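; The argument registers differ between the two supported ABIs: the
; Microsoft x64 convention passes the first three arguments in rcx, rdx,
; r8, while the System V AMD64 ABI uses rdi, rsi, rdx. Both are mapped
; to the common names arg1..arg3 below.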
%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8

        %xdefine        arg1_low32 ecx
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx

        %xdefine        arg1_low32 edi
%endif

align 16
global crc16_t10dif_by4:function
crc16_t10dif_by4:

        ; adjust the 16-bit initial_crc value, scale it to 32 bits
        shl     arg1_low32, 16

        ; After this point, the code flow is exactly the same as for a
        ; 32-bit CRC. The only difference is that before returning eax,
        ; we shift it right by 16 bits, to scale back down to 16 bits.

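        ; Example: init_crc = 0x1234 becomes 0x12340000; the 16-bit CRC
        ; ends up in the upper half of the 32-bit result and is recovered
        ; by the shr in _cleanup.
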
        sub     rsp, 16*4+8

        ; save xmm6 and xmm7 on the stack; they are callee-saved
        ; registers under the win64 ABI
        movdqa  [rsp+16*2], xmm6
        movdqa  [rsp+16*3], xmm7

        ; check if the buffer is smaller than 128B
        cmp     arg3, 128

        ; for sizes less than 128, we can't fold 64B at a time...
        jl      _less_than_128


        ; load the initial crc value
        movd    xmm6, arg1_low32        ; initial crc

        ; the crc value does not need to be byte-reflected, but it does
        ; need to be moved to the high part of the register, because the
        ; data will be byte-reflected and will line up with the initial
        ; crc in the correct place.
        pslldq  xmm6, 12

        movdqa  xmm7, [SHUF_MASK]
        ; load the initial 64B of data and xor in the initial crc value
        movdqu  xmm0, [arg2]
        movdqu  xmm1, [arg2+16]
        movdqu  xmm2, [arg2+32]
        movdqu  xmm3, [arg2+48]

        pshufb  xmm0, xmm7
        ; XOR the initial_crc value
        pxor    xmm0, xmm6
        pshufb  xmm1, xmm7
        pshufb  xmm2, xmm7
        pshufb  xmm3, xmm7

        movdqa  xmm6, [rk3]     ; xmm6 has rk3 and rk4
                                ; the imm value of the pclmulqdq instruction
                                ; will determine which constant to use
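        ; rk3 = 2^480 mod Q << 32 and rk4 = 2^544 mod Q << 32 (see the
        ; .data section); these fold across 64 bytes = 512 bits, with
        ; 480 = 512-32 and 544 = 512+32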
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; we subtract 128 instead of 64 to save one instruction from the loop
        sub     arg3, 128

        ; at this point there are 64*x+y (0 <= y < 64) bytes of buffer
        ; left. The _fold_64_B_loop will fold 64B at a time until only
        ; 64+y bytes of buffer remain.


        ; fold 64B at a time. This section of the code folds 4 xmm
        ; registers in parallel
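        ; each iteration folds a register as
        ;   new = (old_hi64 clmul rk4) xor (old_lo64 clmul rk3) xor next_data
        ; imm 0x11 multiplies the two high qwords, imm 0x0 the two low
        ; qwords, carrying the running remainder forward by 64 bytes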
_fold_64_B_loop:

        ; update the buffer pointer
        add     arg2, 64                ; buf += 64;

        prefetchnta [arg2+fetch_dist+0]
        movdqu  xmm4, xmm0
        movdqu  xmm5, xmm1

        pclmulqdq       xmm0, xmm6, 0x11
        pclmulqdq       xmm1, xmm6, 0x11

        pclmulqdq       xmm4, xmm6, 0x0
        pclmulqdq       xmm5, xmm6, 0x0

        pxor    xmm0, xmm4
        pxor    xmm1, xmm5

        prefetchnta [arg2+fetch_dist+32]
        movdqu  xmm4, xmm2
        movdqu  xmm5, xmm3

        pclmulqdq       xmm2, xmm6, 0x11
        pclmulqdq       xmm3, xmm6, 0x11

        pclmulqdq       xmm4, xmm6, 0x0
        pclmulqdq       xmm5, xmm6, 0x0

        pxor    xmm2, xmm4
        pxor    xmm3, xmm5

        movdqu  xmm4, [arg2]
        movdqu  xmm5, [arg2+16]
        pshufb  xmm4, xmm7
        pshufb  xmm5, xmm7
        pxor    xmm0, xmm4
        pxor    xmm1, xmm5

        movdqu  xmm4, [arg2+32]
        movdqu  xmm5, [arg2+48]
        pshufb  xmm4, xmm7
        pshufb  xmm5, xmm7

        pxor    xmm2, xmm4
        pxor    xmm3, xmm5

        sub     arg3, 64

        ; check if there is another 64B in the buffer to be able to fold
        jge     _fold_64_B_loop
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


        add     arg2, 64
        ; at this point, the buffer pointer is pointing at the last y bytes
        ; of the buffer, and the 64B of folded data is in 4 of the xmm
        ; registers: xmm0, xmm1, xmm2, xmm3


        ; fold the 4 xmm registers to 1 xmm register with different constants

        movdqa  xmm6, [rk1]     ; xmm6 has rk1 and rk2
                                ; the imm value of the pclmulqdq instruction
                                ; will determine which constant to use

        movdqa  xmm4, xmm0
        pclmulqdq       xmm0, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm1, xmm4
        pxor    xmm1, xmm0

        movdqa  xmm4, xmm1
        pclmulqdq       xmm1, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm2, xmm4
        pxor    xmm2, xmm1

        movdqa  xmm4, xmm2
        pclmulqdq       xmm2, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2


        ; instead of 64, we add 48 to the loop counter to save one
        ; instruction from the loop; instead of a cmp instruction, we use
        ; the sign flag set by the add together with the jl instruction
        add     arg3, 64-16
        jl      _final_reduction_for_128

        ; now we have 16+y bytes left to reduce. 16 bytes are in register
        ; xmm3 and the rest is in memory. We can fold 16 bytes at a time
        ; if y >= 16; continue folding 16B at a time.

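        ; rk1 = 2^96 mod Q << 32 and rk2 = 2^160 mod Q << 32 (see .data);
        ; these fold across 16 bytes = 128 bits, with 96 = 128-32 and
        ; 160 = 128+32
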
_16B_reduction_loop:
        movdqa  xmm4, xmm3
        pclmulqdq       xmm3, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        movdqu  xmm0, [arg2]
        pshufb  xmm0, xmm7
        pxor    xmm3, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the
        ; jge instruction; equivalent of: cmp arg3, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge     _16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm3 register


_final_reduction_for_128:
        ; check if any more data to fold. If not, compute the CRC of the final 128 bits
        add     arg3, 16
        je      _128_done

        ; here we process the remaining data, which is less than 16 bytes.
        ; since we know that there was data before the pointer, we can
        ; offset the input pointer back before the actual point, to load
        ; exactly 16 bytes. After that, the registers need to be adjusted.
_get_last_two_xmms:
        movdqa  xmm2, xmm3

        movdqu  xmm1, [arg2 - 16 + arg3]
        pshufb  xmm1, xmm7

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, arg3
        movdqu  xmm0, [rax]

        ; shift xmm2 to the left by arg3 bytes
        pshufb  xmm2, xmm0

        ; shift xmm3 to the right by 16-arg3 bytes
        pxor    xmm0, [mask1]
        pshufb  xmm3, xmm0
        pblendvb        xmm1, xmm2      ; xmm0 is the implicit blend mask

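        ; in effect, xmm3 now holds the old remainder aligned for one
        ; more 16-byte fold, and xmm1 combines the new tail bytes with
        ; the bytes of the old remainder that were shifted out; pblendvb
        ; selects each byte from xmm2 when the high bit of the
        ; corresponding xmm0 mask byte is set, and keeps xmm1 otherwise
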
        ; fold 16 bytes
        movdqa  xmm2, xmm1
        movdqa  xmm4, xmm3
        pclmulqdq       xmm3, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2

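        ; the code below reduces the 128-bit remainder in xmm3 to 32 bits:
        ; a 64-bit fold using rk5 = 2^96 mod Q << 32, then a 32-bit fold
        ; using rk6 = 2^64 mod Q << 32, leaving a value ready for the
        ; Barrett reduction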
_128_done:
        ; compute crc of a 128-bit value
        movdqa  xmm6, [rk5]     ; rk5 and rk6 in xmm6
        movdqa  xmm0, xmm3

        ; 64b fold
        pclmulqdq       xmm3, xmm6, 0x1
        pslldq  xmm0, 8
        pxor    xmm3, xmm0

        ; 32b fold
        movdqa  xmm0, xmm3

        pand    xmm0, [mask2]

        psrldq  xmm3, 12
        pclmulqdq       xmm3, xmm6, 0x10
        pxor    xmm3, xmm0

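        ; in outline, the Barrett reduction replaces a division: multiply
        ; the remaining value by rk7 = floor(2^64/Q) to estimate the
        ; quotient, multiply that estimate by rk8 = Q, and xor the two to
        ; obtain the remainder; the CRC is then read from bits 32..63
        ; with pextrd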
        ; barrett reduction
_barrett:
        movdqa  xmm6, [rk7]     ; rk7 and rk8 in xmm6
        movdqa  xmm0, xmm3
        pclmulqdq       xmm3, xmm6, 0x01
        pslldq  xmm3, 4
        pclmulqdq       xmm3, xmm6, 0x11

        pslldq  xmm3, 4
        pxor    xmm3, xmm0
        pextrd  eax, xmm3, 1

_cleanup:
        ; scale the result back to 16 bits
        shr     eax, 16
        movdqa  xmm6, [rsp+16*2]
        movdqa  xmm7, [rsp+16*3]
        add     rsp, 16*4+8
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_128:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      _less_than_32
        movdqa  xmm7, [SHUF_MASK]

        ; if there is, load the constants
        movdqa  xmm6, [rk1]             ; rk1 and rk2 in xmm6

        movd    xmm0, arg1_low32        ; get the initial crc value
        pslldq  xmm0, 12                ; align it to its correct place
        movdqu  xmm3, [arg2]            ; load the data
        pshufb  xmm3, xmm7              ; byte-reflect the data
        pxor    xmm3, xmm0

        ; update the buffer pointer
        add     arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one
        ; instruction from the loop
        sub     arg3, 32

        jmp     _16B_reduction_loop


align 16
_less_than_32:
        ; move the initial crc to the return value; this is necessary for
        ; zero-length buffers
        mov     eax, arg1_low32
        test    arg3, arg3
        je      _cleanup

        movdqa  xmm7, [SHUF_MASK]

        movd    xmm0, arg1_low32        ; get the initial crc value
        pslldq  xmm0, 12                ; align it to its correct place

        cmp     arg3, 16
        je      _exact_16_left
        jl      _less_than_16_left

        movdqu  xmm3, [arg2]            ; load the data
        pshufb  xmm3, xmm7              ; byte-reflect the data
        pxor    xmm3, xmm0              ; xor the initial crc value
        add     arg2, 16
        sub     arg3, 16
        movdqa  xmm6, [rk1]             ; rk1 and rk2 in xmm6
        jmp     _get_last_two_xmms


align 16
_less_than_16_left:
        ; use stack space to load data less than 16 bytes; zero out the
        ; 16B in memory first

        pxor    xmm1, xmm1
        mov     r11, rsp
        movdqa  [r11], xmm1

        cmp     arg3, 4
        jl      _only_less_than_4

        ; back up the counter value
        mov     r9, arg3
        cmp     arg3, 8
        jl      _less_than_8_left

        ; load 8 bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
_less_than_8_left:

        cmp     arg3, 4
        jl      _less_than_4_left

        ; load 4 bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
_less_than_4_left:

        cmp     arg3, 2
        jl      _less_than_2_left

        ; load 2 bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
_less_than_2_left:
        cmp     arg3, 1
        jl      _zero_left

        ; load 1 byte
        mov     al, [arg2]
        mov     [r11], al
_zero_left:
        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

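        ; the bytes were stored at the bottom of the 16-byte stack buffer,
        ; so after byte-reflection the valid data sits at the top of xmm3;
        ; the pshufb_shf_table entry for the original length (saved in r9)
        ; shifts the register into the position that _128_done expects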
        ; shl r9, 4
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, r9
        movdqu  xmm0, [rax]
        pxor    xmm0, [mask1]

        pshufb  xmm3, xmm0
        jmp     _128_done

align 16
_exact_16_left:
        movdqu  xmm3, [arg2]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

        jmp     _128_done

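        ; with fewer than 4 bytes there is not enough data for the 64-bit
        ; and 32-bit folds in _128_done, so the paths below position the
        ; data with psrldq and jump directly to the Barrett reduction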
_only_less_than_4:
        cmp     arg3, 3
        jl      _only_less_than_3

        ; load 3 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        mov     al, [arg2+2]
        mov     [r11+2], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

        psrldq  xmm3, 5

        jmp     _barrett
_only_less_than_3:
        cmp     arg3, 2
        jl      _only_less_than_2

        ; load 2 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

        psrldq  xmm3, 6

        jmp     _barrett
_only_less_than_2:

        ; load 1 byte
        mov     al, [arg2]
        mov     [r11], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

        psrldq  xmm3, 7

        jmp     _barrett

section .data

; precomputed constants
; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
align 16
; Q = 0x18BB70000
; rk1 = 2^(32*3) mod Q << 32
; rk2 = 2^(32*5) mod Q << 32
; rk3 = 2^(32*15) mod Q << 32
; rk4 = 2^(32*17) mod Q << 32
; rk5 = 2^(32*3) mod Q << 32
; rk6 = 2^(32*2) mod Q << 32
; rk7 = floor(2^64/Q)
; rk8 = Q
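; the "<< 32" in the definitions above means each folding constant is
; stored in the upper 32 bits of its quadword, e.g.
; rk1 = (2^96 mod Q) << 32 = 0x2d56000000000000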
rk1:
        DQ 0x2d56000000000000
rk2:
        DQ 0x06df000000000000
rk3:
        DQ 0x044c000000000000
rk4:
        DQ 0xe658000000000000
rk5:
        DQ 0x2d56000000000000
rk6:
        DQ 0x1368000000000000
rk7:
        DQ 0x00000001f65a57f8
rk8:
        DQ 0x000000018bb70000
mask1:
        dq 0x8080808080808080, 0x8080808080808080
mask2:
        dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

SHUF_MASK:
        dq 0x08090A0B0C0D0E0F, 0x0001020304050607
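; SHUF_MASK reverses the byte order of a 16-byte register via pshufb, so
; the first input byte becomes the most significant byte; this is the
; byte-reflection referred to throughout the code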

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1)  / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2)  / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3)  / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4)  / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5)  / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6)  / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7)  / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8)  / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9)  / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908
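; loading 16 bytes at (pshufb_shf_table + 16 - len) yields a mask that
; shifts a register left by len bytes; xoring that mask with mask1 (0x80
; in every byte) produces the complementary right-shift mask, because
; pshufb zeroes any destination byte whose mask byte has its high bit set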

;;;     func                   core, ver, snum
slversion crc16_t10dif_by4, 05,   02,  0016