1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
7 ; * Redistributions of source code must retain the above copyright
8 ; notice, this list of conditions and the following disclaimer.
9 ; * Redistributions in binary form must reproduce the above copyright
10 ; notice, this list of conditions and the following disclaimer in
11 ; the documentation and/or other materials provided with the
13 ; * Neither the name of Intel Corporation nor the names of its
14 ; contributors may be used to endorse or promote products derived
15 ; from this software without specific prior written permission.
17 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21 ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22 ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23 ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24 ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25 ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
31 ; UINT16 crc16_t10dif_02(
32 ; UINT16 init_crc, //initial CRC value, 16 bits
33 ; const unsigned char *buf, //buffer pointer to calculate CRC on
34 ; UINT64 len //buffer length in bytes (64-bit data)
42 ; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
43 ; URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
; NASM x86-64 source (Intel syntax).
; NOTE(review): this chunk is a partial listing -- the embedded original line
; numbers are non-contiguous, so intervening lines (e.g. the %else/%endif of
; the conditionals below, and the arg2/arg3 register %defines) are not visible.
45 %include "reg_sizes.asm"
; Prefetch distance (bytes) used by the prefetchnta instructions inside the
; 128-byte fold loop.
47 %define fetch_dist 1024
; Map the low 32 bits of the first integer argument per ABI:
; rcx on Win64 ...
54 %ifidn __OUTPUT_FORMAT__, win64
59 %xdefine arg1_low32 ecx
; ... rdi on System V AMD64 (the %else between these two lines is outside
; this view).
65 %xdefine arg1_low32 edi
68 %ifidn __OUTPUT_FORMAT__, win64
; Win64 frame: slots at [rsp+16*2]..[rsp+16*9] hold callee-saved xmm6-xmm13
; (8 registers x 16 bytes), plus scratch/alignment below them.
70 %define VARIABLE_OFFSET 16*10+8
; Non-Windows frame: only scratch space -- SysV has no callee-saved xmm regs.
72 %define VARIABLE_OFFSET 16*2+8
;-----------------------------------------------------------------------------
; UINT16 crc16_t10dif_02(UINT16 init_crc, const unsigned char *buf, UINT64 len)
;
; Folded-carryless-multiply (VPCLMULQDQ/AVX) CRC16 T10-DIF implementation.
; In:   arg1_low32 = init_crc (ecx on Win64, edi on SysV),
;       arg2 = buf, arg3 = len (arg2/arg3 %defines are outside this view).
; Out:  eax = CRC16. The CRC is computed as a 32-bit CRC of the scaled
;       polynomial; the result is shifted right 16 bits before returning.
; Clobbers xmm0-xmm13; on Win64, callee-saved xmm6-xmm13 are spilled to the
; stack frame and restored in the epilogue.
;
; NOTE(review): partial listing -- the function label, several loads/XORs,
; loop labels (_fold_128_B_loop, _16B_reduction_loop, ...), compare/branch
; instructions and %endif lines fall in the gaps between the embedded line
; numbers and are not visible here.
;-----------------------------------------------------------------------------
76 global crc16_t10dif_02:ISAL_SYM_TYPE_FUNCTION
79 ; adjust the 16-bit initial_crc value, scale it to 32 bits
82 ; After this point, code flow is exactly same as a 32-bit CRC.
83 ; The only difference is before returning eax, we will shift it right 16 bits, to scale back to 16 bits.
; Allocate the stack frame (size depends on ABI -- see VARIABLE_OFFSET).
85 sub rsp, VARIABLE_OFFSET
86 %ifidn __OUTPUT_FORMAT__, win64
87 ; push the xmm registers into the stack to maintain
; (Win64 requires xmm6-xmm15 to be preserved across calls; only xmm6-xmm13
; are used by this routine.)
88 vmovdqa [rsp+16*2],xmm6
89 vmovdqa [rsp+16*3],xmm7
90 vmovdqa [rsp+16*4],xmm8
91 vmovdqa [rsp+16*5],xmm9
92 vmovdqa [rsp+16*6],xmm10
93 vmovdqa [rsp+16*7],xmm11
94 vmovdqa [rsp+16*8],xmm12
95 vmovdqa [rsp+16*9],xmm13
98 ; check if smaller than 256
101 ; for sizes less than 256, we can't fold 128B at a time...
105 ; load the initial crc value
106 vmovd xmm10, arg1_low32 ; initial crc
108 ; crc value does not need to be byte-reflected, but it needs to be moved to the high part of the register.
109 ; because data will be byte-reflected and will align with initial crc at correct place.
; SHUF_MASK reverses byte order within the xmm register (big-endian reflect).
112 vmovdqa xmm11, [SHUF_MASK]
113 ; receive the initial 128B data, xor the initial crc value
114 vmovdqu xmm0, [arg2+16*0]
115 vmovdqu xmm1, [arg2+16*1]
116 vmovdqu xmm2, [arg2+16*2]
117 vmovdqu xmm3, [arg2+16*3]
118 vmovdqu xmm4, [arg2+16*4]
119 vmovdqu xmm5, [arg2+16*5]
120 vmovdqu xmm6, [arg2+16*6]
121 vmovdqu xmm7, [arg2+16*7]
124 ; XOR the initial_crc value
; (the vpshufb byte reflections and the vpxor of xmm10 into xmm0 sit in the
; listing gap above)
134 vmovdqa xmm10, [rk3] ;xmm10 has rk3 and rk4
135 ;imm value of pclmulqdq instruction will determine which constant to use
136 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
137 ; we subtract 256 instead of 128 to save one instruction from the loop
140 ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The _fold_128_B_loop
141 ; loop will fold 128B at a time until we have 128+y Bytes of buffer
144 ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel
147 ; update the buffer pointer
148 add arg2, 128 ; buf += 128;
150 prefetchnta [arg2+fetch_dist+0]
151 vmovdqu xmm9, [arg2+16*0]
152 vmovdqu xmm12, [arg2+16*1]
; NASM two-operand AVX shorthand: dst is also first source.
; imm 0x00 multiplies the low 64-bit halves, 0x11 the high halves; each
; 128-bit lane is folded as lo*rk3 XOR hi*rk4 XOR new_data (the vmovdqa
; copies to xmm8/xmm13 and the combining vpxor/vpshufb lines are in the gaps).
157 vpclmulqdq xmm0, xmm10, 0x0
158 vpclmulqdq xmm8, xmm10 , 0x11
159 vpclmulqdq xmm1, xmm10, 0x0
160 vpclmulqdq xmm13, xmm10 , 0x11
166 prefetchnta [arg2+fetch_dist+32]
167 vmovdqu xmm9, [arg2+16*2]
168 vmovdqu xmm12, [arg2+16*3]
; Same fold step for lanes 2 and 3.
173 vpclmulqdq xmm2, xmm10, 0x0
174 vpclmulqdq xmm8, xmm10 , 0x11
175 vpclmulqdq xmm3, xmm10, 0x0
176 vpclmulqdq xmm13, xmm10 , 0x11
182 prefetchnta [arg2+fetch_dist+64]
183 vmovdqu xmm9, [arg2+16*4]
184 vmovdqu xmm12, [arg2+16*5]
; Same fold step for lanes 4 and 5.
189 vpclmulqdq xmm4, xmm10, 0x0
190 vpclmulqdq xmm8, xmm10 , 0x11
191 vpclmulqdq xmm5, xmm10, 0x0
192 vpclmulqdq xmm13, xmm10 , 0x11
198 prefetchnta [arg2+fetch_dist+96]
199 vmovdqu xmm9, [arg2+16*6]
200 vmovdqu xmm12, [arg2+16*7]
; Same fold step for lanes 6 and 7.
205 vpclmulqdq xmm6, xmm10, 0x0
206 vpclmulqdq xmm8, xmm10 , 0x11
207 vpclmulqdq xmm7, xmm10, 0x0
208 vpclmulqdq xmm13, xmm10 , 0x11
216 ; check if there is another 128B in the buffer to be able to fold
218 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
222 ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
223 ; fold the 8 xmm registers to 1 xmm register with different constants
; Each step below folds one xmm lane into the running xmm7 accumulator using
; a dedicated rk pair (rk9/rk10 ... rk19/rk20, loaded from the table at the
; end of the file; the vmovdqa [rk9] and the vpxor combines are in the gaps).
227 vpclmulqdq xmm0, xmm10, 0x11
228 vpclmulqdq xmm8, xmm10, 0x0
232 vmovdqa xmm10, [rk11]
234 vpclmulqdq xmm1, xmm10, 0x11
235 vpclmulqdq xmm8, xmm10, 0x0
239 vmovdqa xmm10, [rk13]
241 vpclmulqdq xmm2, xmm10, 0x11
242 vpclmulqdq xmm8, xmm10, 0x0
246 vmovdqa xmm10, [rk15]
248 vpclmulqdq xmm3, xmm10, 0x11
249 vpclmulqdq xmm8, xmm10, 0x0
253 vmovdqa xmm10, [rk17]
255 vpclmulqdq xmm4, xmm10, 0x11
256 vpclmulqdq xmm8, xmm10, 0x0
260 vmovdqa xmm10, [rk19]
262 vpclmulqdq xmm5, xmm10, 0x11
263 vpclmulqdq xmm8, xmm10, 0x0
267 vmovdqa xmm10, [rk1] ;xmm10 has rk1 and rk2
268 ;imm value of pclmulqdq instruction will determine which constant to use
270 vpclmulqdq xmm6, xmm10, 0x11
271 vpclmulqdq xmm8, xmm10, 0x0
276 ; instead of 128, we add 112 to the loop counter to save 1 instruction from the loop
277 ; instead of a cmp instruction, we use the negative flag with the jl instruction
279 jl _final_reduction_for_128
281 ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory
282 ; we can fold 16 bytes at a time if y>=16
283 ; continue folding 16B at a time
; _16B_reduction_loop body: fold xmm7 with rk1/rk2 and XOR in the next 16
; input bytes (label, load and XOR lines are in the listing gaps).
287 vpclmulqdq xmm7, xmm10, 0x11
288 vpclmulqdq xmm8, xmm10, 0x0
295 ; instead of a cmp instruction, we utilize the flags with the jge instruction
296 ; equivalent of: cmp arg3, 16-16
297 ; check if there is any more 16B in the buffer to be able to fold
298 jge _16B_reduction_loop
300 ;now we have 16+z bytes left to reduce, where 0<= z < 16.
301 ;first, we reduce the data in the xmm7 register
304 _final_reduction_for_128:
305 ; check if any more data to fold. If not, compute the CRC of the final 128 bits
309 ; here we are getting data that is less than 16 bytes.
310 ; since we know that there was data before the pointer, we can offset the input pointer before the actual point, to receive exactly 16 bytes.
311 ; after that the registers need to be adjusted.
; _get_last_two_xmms path: load the final (overlapping) 16 bytes.
315 vmovdqu xmm1, [arg2 - 16 + arg3]
318 ; get rid of the extra data that was loaded before
319 ; load the shift constant
; pshufb_shf_table+16 is indexed backwards by (16 - remaining) bytes to
; produce the shuffle control for the variable shift.
320 lea rax, [pshufb_shf_table + 16]
324 ; shift xmm2 to the left by arg3 bytes
327 ; shift xmm7 to the right by 16-arg3 bytes
; Merge the shifted pieces: blend xmm2 into xmm1 under mask xmm0.
330 vpblendvb xmm1, xmm1, xmm2, xmm0
; Fold the two final xmm registers into one with rk1/rk2.
335 vpclmulqdq xmm7, xmm10, 0x11
336 vpclmulqdq xmm8, xmm10, 0x0
341 ; compute crc of a 128-bit value
342 vmovdqa xmm10, [rk5] ; rk5 and rk6 in xmm10
; 128 -> 64 bit reduction: imm 0x01 = hi(xmm7) * lo(rk pair) ...
346 vpclmulqdq xmm7, xmm10, 0x1
; ... then imm 0x10 = lo * hi(rk6) to finish folding to 32+32 bits.
356 vpclmulqdq xmm7, xmm10, 0x10
; Barrett reduction: rk7 = floor(2^64/Q), rk8 = Q (the polynomial).
361 vmovdqa xmm10, [rk7] ; rk7 and rk8 in xmm10
363 vpclmulqdq xmm7, xmm10, 0x01
365 vpclmulqdq xmm7, xmm10, 0x11
372 ; scale the result back to 16 bits
; (the shr eax, 16 lives in the gap above; epilogue follows)
374 %ifidn __OUTPUT_FORMAT__, win64
; Restore the Win64 callee-saved xmm registers spilled in the prologue.
375 vmovdqa xmm6, [rsp+16*2]
376 vmovdqa xmm7, [rsp+16*3]
377 vmovdqa xmm8, [rsp+16*4]
378 vmovdqa xmm9, [rsp+16*5]
379 vmovdqa xmm10, [rsp+16*6]
380 vmovdqa xmm11, [rsp+16*7]
381 vmovdqa xmm12, [rsp+16*8]
382 vmovdqa xmm13, [rsp+16*9]
; Tear down the stack frame (ret is in the listing gap).
384 add rsp, VARIABLE_OFFSET
388 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
389 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
390 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
391 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; _less_than_256 path (label in gap): buffers of 32..255 bytes, folded 16B
; at a time.
396 ; check if there is enough buffer to be able to fold 16B at a time
399 vmovdqa xmm11, [SHUF_MASK]
401 ; if there is, load the constants
402 vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
404 vmovd xmm0, arg1_low32 ; get the initial crc value
405 vpslldq xmm0, 12 ; align it to its correct place
406 vmovdqu xmm7, [arg2] ; load the plaintext
407 vpshufb xmm7, xmm11 ; byte-reflect the plaintext
411 ; update the buffer pointer
414 ; update the counter. subtract 32 instead of 16 to save one instruction from the loop
417 jmp _16B_reduction_loop
; _less_than_32 path (label in gap): 0..31 bytes remain.
422 ; mov initial crc to the return value. this is necessary for zero-length buffers.
427 vmovdqa xmm11, [SHUF_MASK]
429 vmovd xmm0, arg1_low32 ; get the initial crc value
430 vpslldq xmm0, 12 ; align it to its correct place
434 jl _less_than_16_left
436 vmovdqu xmm7, [arg2] ; load the plaintext
437 vpshufb xmm7, xmm11 ; byte-reflect the plaintext
438 vpxor xmm7, xmm0 ; xor the initial crc value
441 vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10
442 jmp _get_last_two_xmms
; _less_than_16_left and the per-size tail cases follow; only fragments are
; visible below (embedded line numbers jump 456 -> 499 -> 513 -> ... -> 564).
447 ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
456 ; backup the counter value
499 vpxor xmm7, xmm0 ; xor the initial crc value
501 lea rax, [pshufb_shf_table + 16]
513 vpxor xmm7, xmm0 ; xor the initial crc value
533 vpxor xmm7, xmm0 ; xor the initial crc value
551 vpxor xmm7, xmm0 ; xor the initial crc value
564 vpxor xmm7, xmm0 ; xor the initial crc value
; NOTE(review): the data labels (rk1: ... rk20:, the mask tables, SHUF_MASK,
; pshufb_shf_table) fall in the listing gaps between the embedded line
; numbers; only the DQ payloads are visible here.
572 ; precomputed constants
573 ; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
576 ; rk1 = 2^(32*3) mod Q << 32
577 ; rk2 = 2^(32*5) mod Q << 32
578 ; rk3 = 2^(32*15) mod Q << 32
579 ; rk4 = 2^(32*17) mod Q << 32
580 ; rk5 = 2^(32*3) mod Q << 32
581 ; rk6 = 2^(32*2) mod Q << 32
582 ; rk7 = floor(2^64/Q)
; rk1..rk8 payloads (note rk5 == rk1 by construction; rk8 = Q = 0x8bb70000
; with the leading bit, per the rk7/rk8 Barrett pair).
585 DQ 0x2d56000000000000
587 DQ 0x06df000000000000
589 DQ 0x9d9d000000000000
591 DQ 0x7cf5000000000000
593 DQ 0x2d56000000000000
595 DQ 0x1368000000000000
597 DQ 0x00000001f65a57f8
599 DQ 0x000000018bb70000
; rk9..rk20: per-lane fold constants used when collapsing the 8 parallel
; xmm accumulators down to one (distances 128-16 ... 128-112 bytes).
602 DQ 0xceae000000000000
604 DQ 0xbfd6000000000000
606 DQ 0x1e16000000000000
608 DQ 0x713c000000000000
610 DQ 0xf7f9000000000000
612 DQ 0x80a6000000000000
614 DQ 0x044c000000000000
616 DQ 0xe658000000000000
618 DQ 0xad18000000000000
620 DQ 0xa497000000000000
622 DQ 0x6ee3000000000000
624 DQ 0xe7b5000000000000
; 0x80 in every byte: pshufb control that zeroes the result byte -- used for
; the variable-shift masking (label presumably mask3; not visible here).
627 dq 0x8080808080808080, 0x8080808080808080
; Partial-register mask (label not visible in this listing).
629 dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
; SHUF_MASK payload: reverses all 16 bytes of an xmm register (byte reflect).
632 dq 0x08090A0B0C0D0E0F, 0x0001020304050607
635 ; use these values for shift constants for the pshufb instruction
636 ; different alignments result in values as shown:
637 ; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
638 ; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
639 ; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
640 ; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
641 ; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
642 ; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
643 ; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
644 ; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
645 ; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
646 ; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
647 ; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
648 ; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
649 ; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
650 ; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
651 ; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
; The two table rows actually emitted; the lea [pshufb_shf_table + 16] in the
; code indexes into/around these to derive the per-remainder shuffle control.
652 dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
653 dq 0x0706050403020100, 0x000e0d0c0b0a0908