1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2015 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
31 ; UINT16 crc16_t10dif_01(
32 ; UINT16 init_crc, //initial CRC value, 16 bits
33 ; const unsigned char *buf, //buffer pointer to calculate CRC on
34 ; UINT64 len //buffer length in bytes (64-bit data)
42 ; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
43 ; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
45 %include "reg_sizes.asm"
; how many bytes ahead of the current buffer pointer the main loop prefetches
47 %define fetch_dist 1024
; first-integer-argument register differs by ABI: rcx on win64, rdi on SysV;
; arg1_low32 aliases its low 32 bits (holds the 16-bit init_crc zero-extended)
54 %ifidn __OUTPUT_FORMAT__, win64
59 %xdefine arg1_low32 ecx
65 %xdefine arg1_low32 edi
68 %ifidn __OUTPUT_FORMAT__, win64
; win64 frame is larger: room to spill the callee-saved xmm6-xmm13 (8 x 16B)
; plus 8 bytes to keep rsp 16-byte aligned
70 %define VARIABLE_OFFSET 16*10+8
72 %define VARIABLE_OFFSET 16*2+8
;-----------------------------------------------------------------------
; UINT16 crc16_t10dif_01(UINT16 init_crc, const unsigned char *buf, UINT64 len)
; PCLMULQDQ carry-less-multiply CRC folding (see the Intel white paper
; referenced above). The 16-bit CRC is scaled up to 32 bits on entry so
; the folding flow matches the 32-bit CRC code, then scaled back at exit.
;-----------------------------------------------------------------------
76 global crc16_t10dif_01:function
79 ; adjust the 16-bit initial_crc value, scale it to 32 bits
82 ; After this point, code flow is exactly same as a 32-bit CRC.
83 ; The only difference is before returning eax, we will shift it right 16 bits, to scale back to 16 bits.
; allocate the stack frame (size is ABI-dependent, see VARIABLE_OFFSET)
85 sub rsp, VARIABLE_OFFSET
86 %ifidn __OUTPUT_FORMAT__, win64
87 ; push the xmm registers into the stack to maintain
; xmm6-xmm15 are callee-saved under the Microsoft x64 ABI; this routine
; clobbers xmm6-xmm13, so spill them here and restore them in the epilogue
88 movdqa [rsp+16*2],xmm6
89 movdqa [rsp+16*3],xmm7
90 movdqa [rsp+16*4],xmm8
91 movdqa [rsp+16*5],xmm9
92 movdqa [rsp+16*6],xmm10
93 movdqa [rsp+16*7],xmm11
94 movdqa [rsp+16*8],xmm12
95 movdqa [rsp+16*9],xmm13
98 ; check if smaller than 256
101 ; for sizes less than 256, we can't fold 128B at a time...
105 ; load the initial crc value
106 movd xmm10, arg1_low32 ; initial crc
108 ; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
109 ; because data will be byte-reflected and will align with initial crc at correct place.
; SHUF_MASK reverses byte order within the 16B lane (see pshufb uses below)
112 movdqa xmm11, [SHUF_MASK]
113 ; receive the initial 128B data, xor the initial crc value
; xmm0-xmm7 hold the 8 parallel 16-byte folding lanes for the main loop
114 movdqu xmm0, [arg2+16*0]
115 movdqu xmm1, [arg2+16*1]
116 movdqu xmm2, [arg2+16*2]
117 movdqu xmm3, [arg2+16*3]
118 movdqu xmm4, [arg2+16*4]
119 movdqu xmm5, [arg2+16*5]
120 movdqu xmm6, [arg2+16*6]
121 movdqu xmm7, [arg2+16*7]
124 ; XOR the initial_crc value
; rk3/rk4 are the 128-byte-distance folding constants (see constant table)
134 movdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
135 ;imm value of pclmulqdq instruction will determine which constant to use
136 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
137 ; we subtract 256 instead of 128 to save one instruction from the loop
140 ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
141 ; loop will fold 128B at a time until we have 128+y Bytes of buffer
144 ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
147 ; update the buffer pointer
148 add arg2, 128 ; buf += 128;
; Main loop body: each 32B chunk is prefetched, loaded into xmm9/xmm12,
; and folded into two of the eight accumulator lanes. For each lane the
; low qword (imm 0x0) and high qword (imm 0x11) are multiplied by rk3/rk4
; separately and recombined (the xor/pshufb recombination lines are between
; the visible instructions in the full file).
150 prefetchnta [arg2+fetch_dist+0]
151 movdqu xmm9, [arg2+16*0]
152 movdqu xmm12, [arg2+16*1]
; fold lanes 0 and 1
157 pclmulqdq xmm0, xmm10, 0x0
158 pclmulqdq xmm8, xmm10 , 0x11
159 pclmulqdq xmm1, xmm10, 0x0
160 pclmulqdq xmm13, xmm10 , 0x11
166 prefetchnta [arg2+fetch_dist+32]
167 movdqu xmm9, [arg2+16*2]
168 movdqu xmm12, [arg2+16*3]
; fold lanes 2 and 3
173 pclmulqdq xmm2, xmm10, 0x0
174 pclmulqdq xmm8, xmm10 , 0x11
175 pclmulqdq xmm3, xmm10, 0x0
176 pclmulqdq xmm13, xmm10 , 0x11
182 prefetchnta [arg2+fetch_dist+64]
183 movdqu xmm9, [arg2+16*4]
184 movdqu xmm12, [arg2+16*5]
; fold lanes 4 and 5
189 pclmulqdq xmm4, xmm10, 0x0
190 pclmulqdq xmm8, xmm10 , 0x11
191 pclmulqdq xmm5, xmm10, 0x0
192 pclmulqdq xmm13, xmm10 , 0x11
198 prefetchnta [arg2+fetch_dist+96]
199 movdqu xmm9, [arg2+16*6]
200 movdqu xmm12, [arg2+16*7]
; fold lanes 6 and 7
205 pclmulqdq xmm6, xmm10, 0x0
206 pclmulqdq xmm8, xmm10 , 0x11
207 pclmulqdq xmm7, xmm10, 0x0
208 pclmulqdq xmm13, xmm10 , 0x11
216 ; check if there is another 128B in the buffer to be able to fold
218 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
222 ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
223 ; fold the 8 xmm registers to 1 xmm register with different constants
; each step folds one accumulator lane into xmm7 using a distance-specific
; rk constant pair (the movdqa [rk9..rk19] loads sit between these steps
; in the full file); xmm8 carries the low-qword partial product
227 pclmulqdq xmm0, xmm10, 0x11
228 pclmulqdq xmm8, xmm10, 0x0
234 pclmulqdq xmm1, xmm10, 0x11
235 pclmulqdq xmm8, xmm10, 0x0
241 pclmulqdq xmm2, xmm10, 0x11
242 pclmulqdq xmm8, xmm10, 0x0
248 pclmulqdq xmm3, xmm10, 0x11
249 pclmulqdq xmm8, xmm10, 0x0
255 pclmulqdq xmm4, xmm10, 0x11
256 pclmulqdq xmm8, xmm10, 0x0
262 pclmulqdq xmm5, xmm10, 0x11
263 pclmulqdq xmm8, xmm10, 0x0
; last fold uses rk1/rk2 (the 16-byte-distance constants), which are also
; what the 16B reduction loop below needs already loaded in xmm10
267 movdqa xmm10, [rk1] ;xmm10 has rk1 and rk2
268 ;imm value of pclmulqdq instruction will determine which constant to use
270 pclmulqdq xmm6, xmm10, 0x11
271 pclmulqdq xmm8, xmm10, 0x0
276 ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
277 ; instead of a cmp instruction, we use the negative flag with the jl instruction
279 jl _final_reduction_for_128
281 ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
282 ; we can fold 16 bytes at a time if y>=16
283 ; continue folding 16B at a time
; fold the running 16B remainder in xmm7 over the next 16B of input
; using rk1 (high qword, imm 0x11) and rk2 (low qword, imm 0x0)
287 pclmulqdq xmm7, xmm10, 0x11
288 pclmulqdq xmm8, xmm10, 0x0
295 ; instead of a cmp instruction, we utilize the flags with the jge instruction
296 ; equivalent of: cmp arg3, 16-16
297 ; check if there is any more 16B in the buffer to be able to fold
298 jge _16B_reduction_loop
300 ;now we have 16+z bytes left to reduce, where 0<= z < 16.
301 ;first, we reduce the data in the xmm7 register
304 _final_reduction_for_128:
305 ; check if any more data to fold. If not, compute the CRC of the final 128 bits
309 ; here we are getting data that is less than 16 bytes.
310 ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
311 ; after that the registers need to be adjusted.
; backward-overlapping load: the last z bytes plus (16-z) bytes already folded
315 movdqu xmm1, [arg2 - 16 + arg3]
318 ; get rid of the extra data that was loaded before
319 ; load the shift constant
; rax indexes into pshufb_shf_table by (16 - z) to build the byte-shift masks
320 lea rax, [pshufb_shf_table + 16]
324 ; shift xmm2 to the left by arg3 bytes
327 ; shift xmm7 to the right by 16-arg3 bytes
; merge the shifted halves; pblendvb selects per-byte using xmm0's sign bits
330 pblendvb xmm1, xmm2 ;xmm0 is implicit
; one more 16B fold to collapse the merged data into a single 128-bit value
335 pclmulqdq xmm7, xmm10, 0x11
336 pclmulqdq xmm8, xmm10, 0x0
341 ; compute crc of a 128-bit value
; 128 -> 64 -> 32-bit reduction with rk5/rk6, then Barrett reduction with
; rk7 (floor(2^64/Q)) and rk8 (the polynomial Q) to get the final CRC
342 movdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
346 pclmulqdq xmm7, xmm10, 0x1
356 pclmulqdq xmm7, xmm10, 0x10
361 movdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
363 pclmulqdq xmm7, xmm10, 0x01
365 pclmulqdq xmm7, xmm10, 0x11
372 ; scale the result back to 16 bits
; epilogue: restore the win64 callee-saved xmm registers spilled in the
; prologue (same slots, reverse purpose), then release the stack frame
374 %ifidn __OUTPUT_FORMAT__, win64
375 movdqa xmm6, [rsp+16*2]
376 movdqa xmm7, [rsp+16*3]
377 movdqa xmm8, [rsp+16*4]
378 movdqa xmm9, [rsp+16*5]
379 movdqa xmm10, [rsp+16*6]
380 movdqa xmm11, [rsp+16*7]
381 movdqa xmm12, [rsp+16*8]
382 movdqa xmm13, [rsp+16*9]
384 add rsp, VARIABLE_OFFSET
388 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
389 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
390 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
391 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; Small-buffer path (len < 256): skip the 128B loop and fold 16B at a time.
396 ; check if there is enough buffer to be able to fold 16B at a time
399 movdqa xmm11, [SHUF_MASK]
401 ; if there is, load the constants
402 movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
; seed xmm7 with first 16B of (byte-reflected) data xored with the crc,
; which is positioned in the top dword via the 12-byte left shift
404 movd xmm0, arg1_low32 ; get the initial crc value
405 pslldq xmm0, 12 ; align it to its correct place
406 movdqu xmm7, [arg2] ; load the plaintext
407 pshufb xmm7, xmm11 ; byte-reflect the plaintext
411 ; update the buffer pointer
414 ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
417 jmp _16B_reduction_loop
; Path for buffers of fewer than 32 bytes
422 ; mov initial crc to the return value. this is necessary for zero-length buffers.
427 movdqa xmm11, [SHUF_MASK]
429 movd xmm0, arg1_low32 ; get the initial crc value
430 pslldq xmm0, 12 ; align it to its correct place
434 jl _less_than_16_left
; exactly 16..31 bytes: one full 16B block in xmm7, remainder handled
; by the overlapping-load merge at _get_last_two_xmms
436 movdqu xmm7, [arg2] ; load the plaintext
437 pshufb xmm7, xmm11 ; byte-reflect the plaintext
438 pxor xmm7, xmm0 ; xor the initial crc value
441 movdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
442 jmp _get_last_two_xmms
; Sub-16-byte path: the input is copied byte-by-byte into a zeroed 16B
; stack slot, then reduced. The pxor lines below belong to the several
; size-specific branches (8/4/2/1 bytes etc.) of this path — the branch
; labels and copies sit between them in the full file.
447 ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
456 ; backup the counter value
499 pxor xmm7, xmm0 ; xor the initial crc value
501 lea rax, [pshufb_shf_table + 16]
513 pxor xmm7, xmm0 ; xor the initial crc value
533 pxor xmm7, xmm0 ; xor the initial crc value
551 pxor xmm7, xmm0 ; xor the initial crc value
564 pxor xmm7, xmm0 ; xor the initial crc value
572 ; precomputed constants
573 ; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
576 ; rk1 = 2^(32*3) mod Q << 32
577 ; rk2 = 2^(32*5) mod Q << 32
578 ; rk3 = 2^(32*15) mod Q << 32
579 ; rk4 = 2^(32*17) mod Q << 32
580 ; rk5 = 2^(32*3) mod Q << 32
581 ; rk6 = 2^(32*2) mod Q << 32
582 ; rk7 = floor(2^64/Q)
; the quadwords below appear in rk1..rk20 order (their rkN labels are on
; the intervening lines); rk1..rk8 drive the 16B folds and the final
; Barrett reduction, rk9..rk20 the distance-specific 8-to-1 register folds
585 DQ 0x2d56000000000000
587 DQ 0x06df000000000000
589 DQ 0x9d9d000000000000
591 DQ 0x7cf5000000000000
593 DQ 0x2d56000000000000
595 DQ 0x1368000000000000
597 DQ 0x00000001f65a57f8
599 DQ 0x000000018bb70000
602 DQ 0xceae000000000000
604 DQ 0xbfd6000000000000
606 DQ 0x1e16000000000000
608 DQ 0x713c000000000000
610 DQ 0xf7f9000000000000
612 DQ 0x80a6000000000000
614 DQ 0x044c000000000000
616 DQ 0xe658000000000000
618 DQ 0xad18000000000000
620 DQ 0xa497000000000000
622 DQ 0x6ee3000000000000
624 DQ 0xe7b5000000000000
; byte-blend mask (sign bit set in every byte, as required by pblendvb)
635 dq 0x8080808080808080, 0x8080808080808080
637 dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
; SHUF_MASK: pshufb control that byte-reflects a 16-byte lane
640 dq 0x08090A0B0C0D0E0F, 0x0001020304050607
643 ; use these values for shift constants for the pshufb instruction
644 ; different alignments result in values as shown:
645 ; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
646 ; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
647 ; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
648 ; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
649 ; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
650 ; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
651 ; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
652 ; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
653 ; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
654 ; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
655 ; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
656 ; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
657 ; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
658 ; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
659 ; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
; the table data actually emitted (the list above is illustrative, commented out);
; code indexes at [pshufb_shf_table + 16 - z] to derive both shift masks
660 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
661 dq 0x0706050403020100, 0x000e0d0c0b0a0908
663 ;;; func core, ver, snum
; version-stamp macro, presumably defined in reg_sizes.asm — confirm there
664 slversion crc16_t10dif_01, 01, 06, 0010