;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2015 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
;       Function API:
;       UINT16 crc16_t10dif_by4(
;               UINT16 init_crc,          // initial CRC value, 16 bits
;               const unsigned char *buf, // buffer pointer to calculate CRC on
;               UINT64 len                // buffer length in bytes (64-bit data)
;       );
;
;       Authors:
;               Erdinc Ozturk
;               Vinodh Gopal
;               James Guilford
;
;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
;       URL: http://download.intel.com/design/intarch/papers/323102.pdf
;
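;       A minimal C-side usage sketch (illustrative only; the seed, buffer
;       and length below are arbitrary example values, not part of this file):
;
;               #include <stdint.h>
;               uint16_t crc16_t10dif_by4(uint16_t init_crc,
;                                         const unsigned char *buf, uint64_t len);
;
;               unsigned char data[512];
;               uint16_t crc = crc16_t10dif_by4(0, data, sizeof(data));
;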

%include "reg_sizes.asm"

[bits 64]
default rel

section .text
%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8

        %xdefine        arg1_low32 ecx
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx

        %xdefine        arg1_low32 edi
%endif

align 16
global crc16_t10dif_by4:function
crc16_t10dif_by4:

        ; adjust the 16-bit initial_crc value, scale it to 32 bits
        shl     arg1_low32, 16

        ; After this point, the code flow is exactly the same as for a 32-bit CRC.
        ; The only difference is that before returning eax, we shift it
        ; right by 16 bits, to scale back down to 16 bits.
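        ; equivalently: crc16(init, buf, len) == crc32_style(init << 16, buf, len) >> 16,
        ; where crc32_style uses the T10-DIF polynomial scaled to 32 bits
        ; (0x8bb7 << 16 = 0x8bb70000; see the constants section below)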

        sub     rsp, 16*4+8

        ; save xmm6 and xmm7 on the stack
        ; (callee-saved under the win64 calling convention)
        movdqa  [rsp+16*2], xmm6
        movdqa  [rsp+16*3], xmm7

        ; check if smaller than 128B
        cmp     arg3, 128

        ; for sizes less than 128, we can't fold 64B at a time...
        jl      _less_than_128


        ; load the initial crc value
        movd    xmm6, arg1_low32        ; initial crc

        ; the crc value does not need to be byte-reflected, but it does need
        ; to be moved to the high part of the register,
        ; because the data will be byte-reflected and will then align with
        ; the initial crc in the correct place.
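        ; (SHUF_MASK, defined at the bottom of this file, is a full byte
        ; reversal: after pshufb, output byte i = input byte 15-i, so the
        ; message's first byte lands in the most significant byte lane,
        ; lining up with the crc that pslldq places in the top 4 bytes)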
        pslldq  xmm6, 12

        movdqa  xmm7, [SHUF_MASK]
        ; load the initial 64B of data and xor in the initial crc value
        movdqu  xmm0, [arg2]
        movdqu  xmm1, [arg2+16]
        movdqu  xmm2, [arg2+32]
        movdqu  xmm3, [arg2+48]

        pshufb  xmm0, xmm7
        ; XOR the initial_crc value
        pxor    xmm0, xmm6
        pshufb  xmm1, xmm7
        pshufb  xmm2, xmm7
        pshufb  xmm3, xmm7

        movdqa  xmm6, [rk3]     ; xmm6 has rk3 and rk4
                                ; the imm value of the pclmulqdq instruction
                                ; will determine which constant to use
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ; we subtract 128 instead of 64 to save one instruction from the loop
        sub     arg3, 128

        ; at this point there are 64*x+y (0 <= y < 64) bytes of buffer left.
        ; the _fold_64_B_loop below will fold 64B at a time
        ; until only 64+y bytes of buffer remain


        ; fold 64B at a time. This section of the code folds 4 xmm
        ; registers in parallel
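        ; each iteration folds the four xmm registers forward over the next
        ; 64B of input, independently per register:
        ;   x = clmul(x_hi, rk4) xor clmul(x_lo, rk3) xor next_16B_of_data
        ; where rk3/rk4 (loaded into xmm6 above) are the precomputed folding
        ; factors for a 64B (512-bit) distance; see the constants section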
_fold_64_B_loop:

        ; update the buffer pointer
        add     arg2, 64                ; buf += 64;

        movdqu  xmm4, xmm0
        movdqu  xmm5, xmm1

        pclmulqdq       xmm0, xmm6, 0x11
        pclmulqdq       xmm1, xmm6, 0x11

        pclmulqdq       xmm4, xmm6, 0x0
        pclmulqdq       xmm5, xmm6, 0x0

        pxor    xmm0, xmm4
        pxor    xmm1, xmm5

        movdqu  xmm4, xmm2
        movdqu  xmm5, xmm3

        pclmulqdq       xmm2, xmm6, 0x11
        pclmulqdq       xmm3, xmm6, 0x11

        pclmulqdq       xmm4, xmm6, 0x0
        pclmulqdq       xmm5, xmm6, 0x0

        pxor    xmm2, xmm4
        pxor    xmm3, xmm5

        movdqu  xmm4, [arg2]
        movdqu  xmm5, [arg2+16]
        pshufb  xmm4, xmm7
        pshufb  xmm5, xmm7
        pxor    xmm0, xmm4
        pxor    xmm1, xmm5

        movdqu  xmm4, [arg2+32]
        movdqu  xmm5, [arg2+48]
        pshufb  xmm4, xmm7
        pshufb  xmm5, xmm7

        pxor    xmm2, xmm4
        pxor    xmm3, xmm5

        sub     arg3, 64

        ; check if there is another 64B in the buffer to be able to fold
        jge     _fold_64_B_loop
        ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


        add     arg2, 64
        ; at this point, the buffer pointer is pointing at the last y bytes of the buffer
        ; the 64B of folded data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3


        ; fold the 4 xmm registers to 1 xmm register with different constants

        movdqa  xmm6, [rk1]     ; xmm6 has rk1 and rk2
                                ; the imm value of the pclmulqdq instruction will
                                ; determine which constant to use
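        ; each fold below absorbs one register into its 16B-distant neighbor:
        ;   dst = dst xor clmul(src_hi, rk2) xor clmul(src_lo, rk1)
        ; xmm1 absorbs xmm0, xmm2 absorbs xmm1, xmm3 absorbs xmm2, leaving
        ; all 64B of folded state in xmm3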

        movdqa  xmm4, xmm0
        pclmulqdq       xmm0, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm1, xmm4
        pxor    xmm1, xmm0

        movdqa  xmm4, xmm1
        pclmulqdq       xmm1, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm2, xmm4
        pxor    xmm2, xmm1

        movdqa  xmm4, xmm2
        pclmulqdq       xmm2, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2


        ; instead of 64, we add 48 to the loop counter to save 1 instruction from the loop
        ; instead of a cmp instruction, we use the sign flag with the jl instruction
        add     arg3, 64-16
        jl      _final_reduction_for_128

        ; now we have 16+y bytes left to reduce. 16 bytes
        ; are in register xmm3 and the rest is in memory.
        ; we can fold 16 bytes at a time if y >= 16;
        ; continue folding 16B at a time
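        ; each iteration folds the running 16B state into the next 16B of input:
        ;   xmm3 = clmul(xmm3_hi, rk2) xor clmul(xmm3_lo, rk1) xor next_16B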

_16B_reduction_loop:
        movdqa  xmm4, xmm3
        pclmulqdq       xmm3, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        movdqu  xmm0, [arg2]
        pshufb  xmm0, xmm7
        pxor    xmm3, xmm0
        add     arg2, 16
        sub     arg3, 16
        ; instead of a cmp instruction, we utilize the flags with the jge instruction
        ; equivalent of: cmp arg3, 16-16
        ; check if there is any more 16B in the buffer to be able to fold
        jge     _16B_reduction_loop

        ; now we have 16+z bytes left to reduce, where 0 <= z < 16.
        ; first, we reduce the data in the xmm3 register


_final_reduction_for_128:
        ; check if any more data to fold. If not, compute the CRC of the final 128 bits
        add     arg3, 16
        je      _128_done

        ; here we have fewer than 16 bytes of data left.
        ; since we know there was data before the pointer, we can offset
        ; the input pointer back so that we load exactly 16 bytes;
        ; after that, the registers need to be adjusted.
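        ; sketch of the merge below: xmm1 gets the last 16B of the buffer
        ; (its leading bytes overlap data that was already folded), the
        ; pshufb_shf_table entry shifts the folded state (xmm2/xmm3) so the
        ; two registers line up, and pblendvb stitches them together so that
        ; a single final 16B fold consumes the remaining partial block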
_get_last_two_xmms:
        movdqa  xmm2, xmm3

        movdqu  xmm1, [arg2 - 16 + arg3]
        pshufb  xmm1, xmm7

        ; get rid of the extra data that was loaded before
        ; load the shift constant
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, arg3
        movdqu  xmm0, [rax]

        ; shift xmm2 to the left by arg3 bytes
        pshufb  xmm2, xmm0

        ; shift xmm3 to the right by 16-arg3 bytes
        pxor    xmm0, [mask1]
        pshufb  xmm3, xmm0
        pblendvb        xmm1, xmm2      ; xmm0 is implicit

        ; fold 16 bytes
        movdqa  xmm2, xmm1
        movdqa  xmm4, xmm3
        pclmulqdq       xmm3, xmm6, 0x11
        pclmulqdq       xmm4, xmm6, 0x0
        pxor    xmm3, xmm4
        pxor    xmm3, xmm2

_128_done:
        ; compute the crc of a 128-bit value
        movdqa  xmm6, [rk5]     ; rk5 and rk6 in xmm6
        movdqa  xmm0, xmm3

        ; 64b fold
        pclmulqdq       xmm3, xmm6, 0x1
        pslldq  xmm0, 8
        pxor    xmm3, xmm0

        ; 32b fold
        movdqa  xmm0, xmm3

        pand    xmm0, [mask2]

        psrldq  xmm3, 12
        pclmulqdq       xmm3, xmm6, 0x10
        pxor    xmm3, xmm0

        ; barrett reduction
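        ; Barrett reduction computes the final 32-bit remainder without a
        ; division (see the reference paper above); in polynomial terms,
        ; with all multiplications carry-less:
        ;   T1  = floor(R / x^32) * rk7          (rk7 = floor(2^64/Q))
        ;   T2  = floor(T1 / x^32) * rk8         (rk8 = Q)
        ;   crc = (R xor T2) mod x^32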
_barrett:
        movdqa  xmm6, [rk7]     ; rk7 and rk8 in xmm6
        movdqa  xmm0, xmm3
        pclmulqdq       xmm3, xmm6, 0x01
        pslldq  xmm3, 4
        pclmulqdq       xmm3, xmm6, 0x11

        pslldq  xmm3, 4
        pxor    xmm3, xmm0
        pextrd  eax, xmm3, 1

_cleanup:
        ; scale the result back to 16 bits
        shr     eax, 16
        movdqa  xmm6, [rsp+16*2]
        movdqa  xmm7, [rsp+16*3]
        add     rsp, 16*4+8
        ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_128:

        ; check if there is enough buffer to be able to fold 16B at a time
        cmp     arg3, 32
        jl      _less_than_32
        movdqa  xmm7, [SHUF_MASK]

        ; if there is, load the constants
        movdqa  xmm6, [rk1]     ; rk1 and rk2 in xmm6

        movd    xmm0, arg1_low32        ; get the initial crc value
        pslldq  xmm0, 12                ; align it to its correct place
        movdqu  xmm3, [arg2]            ; load the plaintext
        pshufb  xmm3, xmm7              ; byte-reflect the plaintext
        pxor    xmm3, xmm0


        ; update the buffer pointer
        add     arg2, 16

        ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
        sub     arg3, 32

        jmp     _16B_reduction_loop


align 16
_less_than_32:
        ; mov initial crc to the return value. this is necessary for zero-length buffers.
        mov     eax, arg1_low32
        test    arg3, arg3
        je      _cleanup

        movdqa  xmm7, [SHUF_MASK]

        movd    xmm0, arg1_low32        ; get the initial crc value
        pslldq  xmm0, 12                ; align it to its correct place

        cmp     arg3, 16
        je      _exact_16_left
        jl      _less_than_16_left

        movdqu  xmm3, [arg2]            ; load the plaintext
        pshufb  xmm3, xmm7              ; byte-reflect the plaintext
        pxor    xmm3, xmm0              ; xor the initial crc value
        add     arg2, 16
        sub     arg3, 16
        movdqa  xmm6, [rk1]             ; rk1 and rk2 in xmm6
        jmp     _get_last_two_xmms


align 16
_less_than_16_left:
        ; use stack space to load data less than 16 bytes; zero out the 16B in memory first.
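        ; the copies below stage the y (< 16) remaining bytes into the zeroed
        ; stack slot in descending chunks (8, 4, 2, then 1 bytes), so a single
        ; 16B load can then pick the data up with the unused bytes already zero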

        pxor    xmm1, xmm1
        mov     r11, rsp
        movdqa  [r11], xmm1

        cmp     arg3, 4
        jl      _only_less_than_4

        ; backup the counter value
        mov     r9, arg3
        cmp     arg3, 8
        jl      _less_than_8_left

        ; load 8 bytes
        mov     rax, [arg2]
        mov     [r11], rax
        add     r11, 8
        sub     arg3, 8
        add     arg2, 8
_less_than_8_left:

        cmp     arg3, 4
        jl      _less_than_4_left

        ; load 4 bytes
        mov     eax, [arg2]
        mov     [r11], eax
        add     r11, 4
        sub     arg3, 4
        add     arg2, 4
_less_than_4_left:

        cmp     arg3, 2
        jl      _less_than_2_left

        ; load 2 bytes
        mov     ax, [arg2]
        mov     [r11], ax
        add     r11, 2
        sub     arg3, 2
        add     arg2, 2
_less_than_2_left:
        cmp     arg3, 1
        jl      _zero_left

        ; load 1 byte
        mov     al, [arg2]
        mov     [r11], al
_zero_left:
        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

        ; shl r9, 4
        lea     rax, [pshufb_shf_table + 16]
        sub     rax, r9
        movdqu  xmm0, [rax]
        pxor    xmm0, [mask1]

        pshufb  xmm3, xmm0
        jmp     _128_done

align 16
_exact_16_left:
        movdqu  xmm3, [arg2]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

        jmp     _128_done

_only_less_than_4:
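        ; with only y (1 <= y <= 3) input bytes there is not enough data for a
        ; 128-bit fold: the bytes are staged on the stack, byte-reflected and
        ; xored with the crc, then the state is shifted right by 8-y bytes
        ; (psrldq 5/6/7 below) and control jumps straight to _barrett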
        cmp     arg3, 3
        jl      _only_less_than_3

        ; load 3 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        mov     al, [arg2+2]
        mov     [r11+2], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

        psrldq  xmm3, 5

        jmp     _barrett
_only_less_than_3:
        cmp     arg3, 2
        jl      _only_less_than_2

        ; load 2 bytes
        mov     al, [arg2]
        mov     [r11], al

        mov     al, [arg2+1]
        mov     [r11+1], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

        psrldq  xmm3, 6

        jmp     _barrett
_only_less_than_2:

        ; load 1 byte
        mov     al, [arg2]
        mov     [r11], al

        movdqa  xmm3, [rsp]
        pshufb  xmm3, xmm7
        pxor    xmm3, xmm0      ; xor the initial crc value

        psrldq  xmm3, 7

        jmp     _barrett
section .data

; precomputed constants
; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
align 16
; Q = 0x18BB70000
; rk1 = 2^(32*3) mod Q << 32
; rk2 = 2^(32*5) mod Q << 32
; rk3 = 2^(32*15) mod Q << 32
; rk4 = 2^(32*17) mod Q << 32
; rk5 = 2^(32*3) mod Q << 32
; rk6 = 2^(32*2) mod Q << 32
; rk7 = floor(2^64/Q)
; rk8 = Q
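; a worked example of how the table values follow from the formulas above
; (carry-less, i.e. GF(2), arithmetic; this restates the comments and is
; not used by the assembler):
;   rk1 = (x^(32*3) mod Q) << 32
;       = 0x2d560000 << 32 = 0x2d56000000000000
;   rk8 = Q = 0x18BB70000 (the T10-DIF poly 0x18bb7 scaled up by x^16)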
rk1:
        DQ 0x2d56000000000000
rk2:
        DQ 0x06df000000000000
rk3:
        DQ 0x044c000000000000
rk4:
        DQ 0xe658000000000000
rk5:
        DQ 0x2d56000000000000
rk6:
        DQ 0x1368000000000000
rk7:
        DQ 0x00000001f65a57f8
rk8:
        DQ 0x000000018bb70000
mask1:
        dq 0x8080808080808080, 0x8080808080808080
mask2:
        dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

SHUF_MASK:
        dq 0x08090A0B0C0D0E0F, 0x0001020304050607

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;       dq 0x8887868584838281, 0x008f8e8d8c8b8a89       ; shl 15 (16-1) / shr1
;       dq 0x8988878685848382, 0x01008f8e8d8c8b8a       ; shl 14 (16-2) / shr2
;       dq 0x8a89888786858483, 0x0201008f8e8d8c8b       ; shl 13 (16-3) / shr3
;       dq 0x8b8a898887868584, 0x030201008f8e8d8c       ; shl 12 (16-4) / shr4
;       dq 0x8c8b8a8988878685, 0x04030201008f8e8d       ; shl 11 (16-5) / shr5
;       dq 0x8d8c8b8a89888786, 0x0504030201008f8e       ; shl 10 (16-6) / shr6
;       dq 0x8e8d8c8b8a898887, 0x060504030201008f       ; shl 9  (16-7) / shr7
;       dq 0x8f8e8d8c8b8a8988, 0x0706050403020100       ; shl 8  (16-8) / shr8
;       dq 0x008f8e8d8c8b8a89, 0x0807060504030201       ; shl 7  (16-9) / shr9
;       dq 0x01008f8e8d8c8b8a, 0x0908070605040302       ; shl 6  (16-10) / shr10
;       dq 0x0201008f8e8d8c8b, 0x0a09080706050403       ; shl 5  (16-11) / shr11
;       dq 0x030201008f8e8d8c, 0x0b0a090807060504       ; shl 4  (16-12) / shr12
;       dq 0x04030201008f8e8d, 0x0c0b0a0908070605       ; shl 3  (16-13) / shr13
;       dq 0x0504030201008f8e, 0x0d0c0b0a09080706       ; shl 2  (16-14) / shr14
;       dq 0x060504030201008f, 0x0e0d0c0b0a090807       ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908

;;;       func        core, ver, snum
slversion crc16_t10dif_by4, 05, 02, 0016